diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 4673a5bc4..3e3fe573c 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -2944,244 +2944,156 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + switch (offset / 4) { case 0: - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 2: - w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3189,32 +3101,18 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3223,30 +3121,17 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 8: - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3256,28 +3141,16 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3288,26 +3161,15 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3319,24 +3181,14 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3349,22 +3201,13 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -3378,20 +3221,12 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -3406,18 +3241,11 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -3433,16 +3261,10 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -3459,13 +3281,25 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); #endif #ifdef IS_NV @@ -3802,6 +3636,519 @@ void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + switch (offset / 4) + { + case 0: + c0[0] = amd_bytealign (w3[3], 0, offset); + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = amd_bytealign (w3[3], 0, offset); + c0[0] = amd_bytealign (w3[2], w3[3], offset); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = amd_bytealign (w3[3], 0, offset); + c0[1] = amd_bytealign (w3[2], w3[3], offset); + c0[0] = amd_bytealign (w3[1], w3[2], offset); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = amd_bytealign (w3[3], 0, offset); + c0[2] = amd_bytealign (w3[2], w3[3], offset); + c0[1] = amd_bytealign (w3[1], w3[2], offset); + c0[0] = amd_bytealign (w3[0], w3[1], offset); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = amd_bytealign (w3[3], 0, offset); + c0[3] = amd_bytealign (w3[2], w3[3], offset); + c0[2] = amd_bytealign (w3[1], w3[2], offset); + c0[1] = amd_bytealign (w3[0], w3[1], offset); + c0[0] = amd_bytealign (w2[3], w3[0], offset); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = amd_bytealign (w3[3], 0, offset); + c1[0] = amd_bytealign (w3[2], w3[3], offset); + c0[3] = amd_bytealign (w3[1], w3[2], offset); + c0[2] = amd_bytealign (w3[0], w3[1], offset); + c0[1] = amd_bytealign (w2[3], w3[0], offset); + c0[0] = amd_bytealign (w2[2], w2[3], offset); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = amd_bytealign (w3[3], 0, offset); + c1[1] = amd_bytealign (w3[2], w3[3], offset); + c1[0] = amd_bytealign (w3[1], w3[2], offset); + c0[3] = amd_bytealign (w3[0], w3[1], offset); + c0[2] = amd_bytealign (w2[3], w3[0], offset); + c0[1] = amd_bytealign (w2[2], w2[3], offset); + c0[0] = amd_bytealign (w2[1], w2[2], offset); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = amd_bytealign (w3[3], 0, offset); + c1[2] = amd_bytealign (w3[2], w3[3], offset); + c1[1] = amd_bytealign (w3[1], w3[2], offset); + c1[0] = amd_bytealign (w3[0], w3[1], offset); + c0[3] = amd_bytealign (w2[3], w3[0], offset); + c0[2] = amd_bytealign (w2[2], w2[3], offset); + c0[1] = amd_bytealign (w2[1], w2[2], offset); + c0[0] = amd_bytealign (w2[0], w2[1], offset); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = amd_bytealign (w3[3], 0, offset); + c1[3] = amd_bytealign (w3[2], w3[3], offset); + c1[2] = amd_bytealign (w3[1], w3[2], offset); + c1[1] = amd_bytealign (w3[0], w3[1], offset); + c1[0] = amd_bytealign (w2[3], w3[0], offset); + c0[3] = amd_bytealign (w2[2], w2[3], offset); + c0[2] = amd_bytealign (w2[1], w2[2], offset); + c0[1] = amd_bytealign (w2[0], w2[1], offset); + c0[0] = amd_bytealign (w1[3], w2[0], offset); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = amd_bytealign (w3[3], 0, offset); + c2[0] = amd_bytealign (w3[2], w3[3], offset); + c1[3] = amd_bytealign (w3[1], w3[2], offset); + c1[2] = amd_bytealign (w3[0], w3[1], offset); + c1[1] = amd_bytealign (w2[3], w3[0], offset); + c1[0] = amd_bytealign (w2[2], w2[3], offset); + c0[3] = amd_bytealign (w2[1], w2[2], offset); + c0[2] = amd_bytealign (w2[0], w2[1], offset); + c0[1] = amd_bytealign (w1[3], w2[0], offset); + c0[0] = amd_bytealign (w1[2], w1[3], offset); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = amd_bytealign (w3[3], 0, offset); + c2[1] = amd_bytealign (w3[2], w3[3], offset); + c2[0] = amd_bytealign (w3[1], w3[2], offset); + c1[3] = amd_bytealign (w3[0], w3[1], offset); + c1[2] = amd_bytealign (w2[3], w3[0], offset); + c1[1] = amd_bytealign (w2[2], w2[3], offset); + c1[0] = amd_bytealign (w2[1], w2[2], offset); + c0[3] = amd_bytealign (w2[0], w2[1], offset); + c0[2] = amd_bytealign (w1[3], w2[0], offset); + c0[1] = amd_bytealign (w1[2], w1[3], offset); + c0[0] = amd_bytealign (w1[1], w1[2], offset); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = amd_bytealign (w3[3], 0, offset); + c2[2] = amd_bytealign (w3[2], w3[3], offset); + c2[1] = amd_bytealign (w3[1], w3[2], offset); + c2[0] = amd_bytealign (w3[0], w3[1], offset); + c1[3] = amd_bytealign (w2[3], w3[0], offset); + c1[2] = amd_bytealign (w2[2], w2[3], offset); + c1[1] = amd_bytealign (w2[1], w2[2], offset); + c1[0] = amd_bytealign (w2[0], w2[1], offset); + c0[3] = amd_bytealign (w1[3], w2[0], offset); + c0[2] = amd_bytealign (w1[2], w1[3], offset); + c0[1] = amd_bytealign (w1[1], w1[2], offset); + c0[0] = amd_bytealign (w1[0], w1[1], offset); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = amd_bytealign (w3[3], 0, offset); + c2[3] = amd_bytealign (w3[2], w3[3], offset); + c2[2] = amd_bytealign (w3[1], w3[2], offset); + c2[1] = amd_bytealign (w3[0], w3[1], offset); + c2[0] = amd_bytealign (w2[3], w3[0], offset); + c1[3] = amd_bytealign (w2[2], w2[3], offset); + c1[2] = amd_bytealign (w2[1], w2[2], offset); + c1[1] = amd_bytealign (w2[0], w2[1], offset); + c1[0] = amd_bytealign (w1[3], w2[0], offset); + c0[3] = amd_bytealign (w1[2], w1[3], offset); + c0[2] = amd_bytealign (w1[1], w1[2], offset); + c0[1] = amd_bytealign (w1[0], w1[1], offset); + c0[0] = amd_bytealign (w0[3], w1[0], offset); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = amd_bytealign (w3[3], 0, offset); + c3[0] = amd_bytealign (w3[2], w3[3], offset); + c2[3] = amd_bytealign (w3[1], w3[2], offset); + c2[2] = amd_bytealign (w3[0], w3[1], offset); + c2[1] = amd_bytealign (w2[3], w3[0], offset); + c2[0] = amd_bytealign (w2[2], w2[3], offset); + c1[3] = amd_bytealign (w2[1], w2[2], offset); + c1[2] = amd_bytealign (w2[0], w2[1], offset); + c1[1] = amd_bytealign (w1[3], w2[0], offset); + c1[0] = amd_bytealign (w1[2], w1[3], offset); + c0[3] = amd_bytealign (w1[1], w1[2], offset); + c0[2] = amd_bytealign (w1[0], w1[1], offset); + c0[1] = amd_bytealign (w0[3], w1[0], offset); + c0[0] = amd_bytealign (w0[2], w0[3], offset); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = amd_bytealign (w3[3], 0, offset); + c3[1] = amd_bytealign (w3[2], w3[3], offset); + c3[0] = amd_bytealign (w3[1], w3[2], offset); + c2[3] = amd_bytealign (w3[0], w3[1], offset); + c2[2] = amd_bytealign (w2[3], w3[0], offset); + c2[1] = amd_bytealign (w2[2], w2[3], offset); + c2[0] = amd_bytealign (w2[1], w2[2], offset); + c1[3] = amd_bytealign (w2[0], w2[1], offset); + c1[2] = amd_bytealign (w1[3], w2[0], offset); + c1[1] = amd_bytealign (w1[2], w1[3], offset); + c1[0] = amd_bytealign (w1[1], w1[2], offset); + c0[3] = amd_bytealign (w1[0], w1[1], offset); + c0[2] = amd_bytealign (w0[3], w1[0], offset); + c0[1] = amd_bytealign (w0[2], w0[3], offset); + c0[0] = amd_bytealign (w0[1], w0[2], offset); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = amd_bytealign (w3[3], 0, offset); + c3[2] = amd_bytealign (w3[2], w3[3], offset); + c3[1] = amd_bytealign (w3[1], w3[2], offset); + c3[0] = amd_bytealign (w3[0], w3[1], offset); + c2[3] = amd_bytealign (w2[3], w3[0], offset); + c2[2] = amd_bytealign (w2[2], w2[3], offset); + c2[1] = amd_bytealign (w2[1], w2[2], offset); + c2[0] = amd_bytealign (w2[0], w2[1], offset); + c1[3] = amd_bytealign (w1[3], w2[0], offset); + c1[2] = amd_bytealign (w1[2], w1[3], offset); + c1[1] = amd_bytealign (w1[1], w1[2], offset); + c1[0] = amd_bytealign (w1[0], w1[1], offset); + c0[3] = amd_bytealign (w0[3], w1[0], offset); + c0[2] = amd_bytealign (w0[2], w0[3], offset); + c0[1] = amd_bytealign (w0[1], w0[2], offset); + c0[0] = amd_bytealign (w0[0], w0[1], offset); + w3[3] = amd_bytealign ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + c0[0] = swap32 (c0[0]); + c0[1] = swap32 (c0[1]); + c0[2] = swap32 (c0[2]); + c0[3] = swap32 (c0[3]); + c1[0] = swap32 (c1[0]); + c1[1] = swap32 (c1[1]); + c1[2] = swap32 (c1[2]); + c1[3] = swap32 (c1[3]); + c2[0] = swap32 (c2[0]); + c2[1] = swap32 (c2[1]); + c2[2] = swap32 (c2[2]); + c2[3] = swap32 (c2[3]); + c3[0] = swap32 (c3[0]); + c3[1] = swap32 (c3[1]); + c3[2] = swap32 (c3[2]); + c3[3] = swap32 (c3[3]); + #endif + + #ifdef IS_NV + // todo switch (offset / 4) { case 0: @@ -4596,6 +4943,7 @@ void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; } + #endif } void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) @@ -6187,452 +6535,284 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + switch (offset / 4) { - case 0: - w7[3] = amd_bytealign (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + case 0: + w7[3] = amd_bytealign (w7[2], w7[3], offset); + w7[2] = amd_bytealign (w7[1], w7[2], offset); + w7[1] = amd_bytealign (w7[0], w7[1], offset); + w7[0] = amd_bytealign (w6[3], w7[0], offset); + w6[3] = amd_bytealign (w6[2], w6[3], offset); + w6[2] = amd_bytealign (w6[1], w6[2], offset); + w6[1] = amd_bytealign (w6[0], w6[1], offset); + w6[0] = amd_bytealign (w5[3], w6[0], offset); + w5[3] = amd_bytealign (w5[2], w5[3], offset); + w5[2] = amd_bytealign (w5[1], w5[2], offset); + w5[1] = amd_bytealign (w5[0], w5[1], offset); + w5[0] = amd_bytealign (w4[3], w5[0], offset); + w4[3] = amd_bytealign (w4[2], w4[3], offset); + w4[2] = amd_bytealign (w4[1], w4[2], offset); + w4[1] = amd_bytealign (w4[0], w4[1], offset); + w4[0] = amd_bytealign (w3[3], w4[0], offset); + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; - case 1: - w7[3] = amd_bytealign (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 1: + w7[3] = amd_bytealign (w7[1], w7[2], offset); + w7[2] = amd_bytealign (w7[0], w7[1], offset); + w7[1] = amd_bytealign (w6[3], w7[0], offset); + w7[0] = amd_bytealign (w6[2], w6[3], offset); + w6[3] = amd_bytealign (w6[1], w6[2], offset); + w6[2] = amd_bytealign (w6[0], w6[1], offset); + w6[1] = amd_bytealign (w5[3], w6[0], offset); + w6[0] = amd_bytealign (w5[2], w5[3], offset); + w5[3] = amd_bytealign (w5[1], w5[2], offset); + w5[2] = amd_bytealign (w5[0], w5[1], offset); + w5[1] = amd_bytealign (w4[3], w5[0], offset); + w5[0] = amd_bytealign (w4[2], w4[3], offset); + w4[3] = amd_bytealign (w4[1], w4[2], offset); + w4[2] = amd_bytealign (w4[0], w4[1], offset); + w4[1] = amd_bytealign (w3[3], w4[0], offset); + w4[0] = amd_bytealign (w3[2], w3[3], offset); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + case 2: + w7[3] = amd_bytealign (w7[0], w7[1], offset); + w7[2] = amd_bytealign (w6[3], w7[0], offset); + w7[1] = amd_bytealign (w6[2], w6[3], offset); + w7[0] = amd_bytealign (w6[1], w6[2], offset); + w6[3] = amd_bytealign (w6[0], w6[1], offset); + w6[2] = amd_bytealign (w5[3], w6[0], offset); + w6[1] = amd_bytealign (w5[2], w5[3], offset); + w6[0] = amd_bytealign (w5[1], w5[2], offset); + w5[3] = amd_bytealign (w5[0], w5[1], offset); + w5[2] = amd_bytealign (w4[3], w5[0], offset); + w5[1] = amd_bytealign (w4[2], w4[3], offset); + w5[0] = amd_bytealign (w4[1], w4[2], offset); + w4[3] = amd_bytealign (w4[0], w4[1], offset); + w4[2] = amd_bytealign (w3[3], w4[0], offset); + w4[1] = amd_bytealign (w3[2], w3[3], offset); + w4[0] = amd_bytealign (w3[1], w3[2], offset); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 3: - w7[3] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + case 3: + w7[3] = amd_bytealign (w6[3], w7[0], offset); + w7[2] = amd_bytealign (w6[2], w6[3], offset); + w7[1] = amd_bytealign (w6[1], w6[2], offset); + w7[0] = amd_bytealign (w6[0], w6[1], offset); + w6[3] = amd_bytealign (w5[3], w6[0], offset); + w6[2] = amd_bytealign (w5[2], w5[3], offset); + w6[1] = amd_bytealign (w5[1], w5[2], offset); + w6[0] = amd_bytealign (w5[0], w5[1], offset); + w5[3] = amd_bytealign (w4[3], w5[0], offset); + w5[2] = amd_bytealign (w4[2], w4[3], offset); + w5[1] = amd_bytealign (w4[1], w4[2], offset); + w5[0] = amd_bytealign (w4[0], w4[1], offset); + w4[3] = amd_bytealign (w3[3], w4[0], offset); + w4[2] = amd_bytealign (w3[2], w3[3], offset); + w4[1] = amd_bytealign (w3[1], w3[2], offset); + w4[0] = amd_bytealign (w3[0], w3[1], offset); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 4: + w7[3] = amd_bytealign (w6[2], w6[3], offset); + w7[2] = amd_bytealign (w6[1], w6[2], offset); + w7[1] = amd_bytealign (w6[0], w6[1], offset); + w7[0] = amd_bytealign (w5[3], w6[0], offset); + w6[3] = amd_bytealign (w5[2], w5[3], offset); + w6[2] = amd_bytealign (w5[1], w5[2], offset); + w6[1] = amd_bytealign (w5[0], w5[1], offset); + w6[0] = amd_bytealign (w4[3], w5[0], offset); + w5[3] = amd_bytealign (w4[2], w4[3], offset); + w5[2] = amd_bytealign (w4[1], w4[2], offset); + w5[1] = amd_bytealign (w4[0], w4[1], offset); + w5[0] = amd_bytealign (w3[3], w4[0], offset); + w4[3] = amd_bytealign (w3[2], w3[3], offset); + w4[2] = amd_bytealign (w3[1], w3[2], offset); + w4[1] = amd_bytealign (w3[0], w3[1], offset); + w4[0] = amd_bytealign (w2[3], w3[0], offset); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 5: + w7[3] = amd_bytealign (w6[1], w6[2], offset); + w7[2] = amd_bytealign (w6[0], w6[1], offset); + w7[1] = amd_bytealign (w5[3], w6[0], offset); + w7[0] = amd_bytealign (w5[2], w5[3], offset); + w6[3] = amd_bytealign (w5[1], w5[2], offset); + w6[2] = amd_bytealign (w5[0], w5[1], offset); + w6[1] = amd_bytealign (w4[3], w5[0], offset); + w6[0] = amd_bytealign (w4[2], w4[3], offset); + w5[3] = amd_bytealign (w4[1], w4[2], offset); + w5[2] = amd_bytealign (w4[0], w4[1], offset); + w5[1] = amd_bytealign (w3[3], w4[0], offset); + w5[0] = amd_bytealign (w3[2], w3[3], offset); + w4[3] = amd_bytealign (w3[1], w3[2], offset); + w4[2] = amd_bytealign (w3[0], w3[1], offset); + w4[1] = amd_bytealign (w2[3], w3[0], offset); + w4[0] = amd_bytealign (w2[2], w2[3], offset); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + case 6: + w7[3] = amd_bytealign (w6[0], w6[1], offset); + w7[2] = amd_bytealign (w5[3], w6[0], offset); + w7[1] = amd_bytealign (w5[2], w5[3], offset); + w7[0] = amd_bytealign (w5[1], w5[2], offset); + w6[3] = amd_bytealign (w5[0], w5[1], offset); + w6[2] = amd_bytealign (w4[3], w5[0], offset); + w6[1] = amd_bytealign (w4[2], w4[3], offset); + w6[0] = amd_bytealign (w4[1], w4[2], offset); + w5[3] = amd_bytealign (w4[0], w4[1], offset); + w5[2] = amd_bytealign (w3[3], w4[0], offset); + w5[1] = amd_bytealign (w3[2], w3[3], offset); + w5[0] = amd_bytealign (w3[1], w3[2], offset); + w4[3] = amd_bytealign (w3[0], w3[1], offset); + w4[2] = amd_bytealign (w2[3], w3[0], offset); + w4[1] = amd_bytealign (w2[2], w2[3], offset); + w4[0] = amd_bytealign (w2[1], w2[2], offset); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -6640,64 +6820,34 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + case 7: + w7[3] = amd_bytealign (w5[3], w6[0], offset); + w7[2] = amd_bytealign (w5[2], w5[3], offset); + w7[1] = amd_bytealign (w5[1], w5[2], offset); + w7[0] = amd_bytealign (w5[0], w5[1], offset); + w6[3] = amd_bytealign (w4[3], w5[0], offset); + w6[2] = amd_bytealign (w4[2], w4[3], offset); + w6[1] = amd_bytealign (w4[1], w4[2], offset); + w6[0] = amd_bytealign (w4[0], w4[1], offset); + w5[3] = amd_bytealign (w3[3], w4[0], offset); + w5[2] = amd_bytealign (w3[2], w3[3], offset); + w5[1] = amd_bytealign (w3[1], w3[2], offset); + w5[0] = amd_bytealign (w3[0], w3[1], offset); + w4[3] = amd_bytealign (w2[3], w3[0], offset); + w4[2] = amd_bytealign (w2[2], w2[3], offset); + w4[1] = amd_bytealign (w2[1], w2[2], offset); + w4[0] = amd_bytealign (w2[0], w2[1], offset); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -6706,62 +6856,33 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 8: - w7[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 8: + w7[3] = amd_bytealign (w5[2], w5[3], offset); + w7[2] = amd_bytealign (w5[1], w5[2], offset); + w7[1] = amd_bytealign (w5[0], w5[1], offset); + w7[0] = amd_bytealign (w4[3], w5[0], offset); + w6[3] = amd_bytealign (w4[2], w4[3], offset); + w6[2] = amd_bytealign (w4[1], w4[2], offset); + w6[1] = amd_bytealign (w4[0], w4[1], offset); + w6[0] = amd_bytealign (w3[3], w4[0], offset); + w5[3] = amd_bytealign (w3[2], w3[3], offset); + w5[2] = amd_bytealign (w3[1], w3[2], offset); + w5[1] = amd_bytealign (w3[0], w3[1], offset); + w5[0] = amd_bytealign (w2[3], w3[0], offset); + w4[3] = amd_bytealign (w2[2], w2[3], offset); + w4[2] = amd_bytealign (w2[1], w2[2], offset); + w4[1] = amd_bytealign (w2[0], w2[1], offset); + w4[0] = amd_bytealign (w1[3], w2[0], offset); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -6771,60 +6892,32 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 9: + w7[3] = amd_bytealign (w5[1], w5[2], offset); + w7[2] = amd_bytealign (w5[0], w5[1], offset); + w7[1] = amd_bytealign (w4[3], w5[0], offset); + w7[0] = amd_bytealign (w4[2], w4[3], offset); + w6[3] = amd_bytealign (w4[1], w4[2], offset); + w6[2] = amd_bytealign (w4[0], w4[1], offset); + w6[1] = amd_bytealign (w3[3], w4[0], offset); + w6[0] = amd_bytealign (w3[2], w3[3], offset); + w5[3] = amd_bytealign (w3[1], w3[2], offset); + w5[2] = amd_bytealign (w3[0], w3[1], offset); + w5[1] = amd_bytealign (w2[3], w3[0], offset); + w5[0] = amd_bytealign (w2[2], w2[3], offset); + w4[3] = amd_bytealign (w2[1], w2[2], offset); + w4[2] = amd_bytealign (w2[0], w2[1], offset); + w4[1] = amd_bytealign (w1[3], w2[0], offset); + w4[0] = amd_bytealign (w1[2], w1[3], offset); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -6835,58 +6928,31 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 10: - w7[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w5[0], w5[1], offset); + w7[2] = amd_bytealign (w4[3], w5[0], offset); + w7[1] = amd_bytealign (w4[2], w4[3], offset); + w7[0] = amd_bytealign (w4[1], w4[2], offset); + w6[3] = amd_bytealign (w4[0], w4[1], offset); + w6[2] = amd_bytealign (w3[3], w4[0], offset); + w6[1] = amd_bytealign (w3[2], w3[3], offset); + w6[0] = amd_bytealign (w3[1], w3[2], offset); + w5[3] = amd_bytealign (w3[0], w3[1], offset); + w5[2] = amd_bytealign (w2[3], w3[0], offset); + w5[1] = amd_bytealign (w2[2], w2[3], offset); + w5[0] = amd_bytealign (w2[1], w2[2], offset); + w4[3] = amd_bytealign (w2[0], w2[1], offset); + w4[2] = amd_bytealign (w1[3], w2[0], offset); + w4[1] = amd_bytealign (w1[2], w1[3], offset); + w4[0] = amd_bytealign (w1[1], w1[2], offset); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -6898,56 +6964,30 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 11: - w7[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[3], w5[0], offset); + w7[2] = amd_bytealign (w4[2], w4[3], offset); + w7[1] = amd_bytealign (w4[1], w4[2], offset); + w7[0] = amd_bytealign (w4[0], w4[1], offset); + w6[3] = amd_bytealign (w3[3], w4[0], offset); + w6[2] = amd_bytealign (w3[2], w3[3], offset); + w6[1] = amd_bytealign (w3[1], w3[2], offset); + w6[0] = amd_bytealign (w3[0], w3[1], offset); + w5[3] = amd_bytealign (w2[3], w3[0], offset); + w5[2] = amd_bytealign (w2[2], w2[3], offset); + w5[1] = amd_bytealign (w2[1], w2[2], offset); + w5[0] = amd_bytealign (w2[0], w2[1], offset); + w4[3] = amd_bytealign (w1[3], w2[0], offset); + w4[2] = amd_bytealign (w1[2], w1[3], offset); + w4[1] = amd_bytealign (w1[1], w1[2], offset); + w4[0] = amd_bytealign (w1[0], w1[1], offset); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -6960,54 +7000,29 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 12: - w7[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[2], w4[3], offset); + w7[2] = amd_bytealign (w4[1], w4[2], offset); + w7[1] = amd_bytealign (w4[0], w4[1], offset); + w7[0] = amd_bytealign (w3[3], w4[0], offset); + w6[3] = amd_bytealign (w3[2], w3[3], offset); + w6[2] = amd_bytealign (w3[1], w3[2], offset); + w6[1] = amd_bytealign (w3[0], w3[1], offset); + w6[0] = amd_bytealign (w2[3], w3[0], offset); + w5[3] = amd_bytealign (w2[2], w2[3], offset); + w5[2] = amd_bytealign (w2[1], w2[2], offset); + w5[1] = amd_bytealign (w2[0], w2[1], offset); + w5[0] = amd_bytealign (w1[3], w2[0], offset); + w4[3] = amd_bytealign (w1[2], w1[3], offset); + w4[2] = amd_bytealign (w1[1], w1[2], offset); + w4[1] = amd_bytealign (w1[0], w1[1], offset); + w4[0] = amd_bytealign (w0[3], w1[0], offset); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7021,52 +7036,28 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 13: - w7[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[1], w4[2], offset); + w7[2] = amd_bytealign (w4[0], w4[1], offset); + w7[1] = amd_bytealign (w3[3], w4[0], offset); + w7[0] = amd_bytealign (w3[2], w3[3], offset); + w6[3] = amd_bytealign (w3[1], w3[2], offset); + w6[2] = amd_bytealign (w3[0], w3[1], offset); + w6[1] = amd_bytealign (w2[3], w3[0], offset); + w6[0] = amd_bytealign (w2[2], w2[3], offset); + w5[3] = amd_bytealign (w2[1], w2[2], offset); + w5[2] = amd_bytealign (w2[0], w2[1], offset); + w5[1] = amd_bytealign (w1[3], w2[0], offset); + w5[0] = amd_bytealign (w1[2], w1[3], offset); + w4[3] = amd_bytealign (w1[1], w1[2], offset); + w4[2] = amd_bytealign (w1[0], w1[1], offset); + w4[1] = amd_bytealign (w0[3], w1[0], offset); + w4[0] = amd_bytealign (w0[2], w0[3], offset); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -7081,50 +7072,27 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 14: - w7[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[0], w4[1], offset); + w7[2] = amd_bytealign (w3[3], w4[0], offset); + w7[1] = amd_bytealign (w3[2], w3[3], offset); + w7[0] = amd_bytealign (w3[1], w3[2], offset); + w6[3] = amd_bytealign (w3[0], w3[1], offset); + w6[2] = amd_bytealign (w2[3], w3[0], offset); + w6[1] = amd_bytealign (w2[2], w2[3], offset); + w6[0] = amd_bytealign (w2[1], w2[2], offset); + w5[3] = amd_bytealign (w2[0], w2[1], offset); + w5[2] = amd_bytealign (w1[3], w2[0], offset); + w5[1] = amd_bytealign (w1[2], w1[3], offset); + w5[0] = amd_bytealign (w1[1], w1[2], offset); + w4[3] = amd_bytealign (w1[0], w1[1], offset); + w4[2] = amd_bytealign (w0[3], w1[0], offset); + w4[1] = amd_bytealign (w0[2], w0[3], offset); + w4[0] = amd_bytealign (w0[1], w0[2], offset); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -7140,48 +7108,26 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 15: - w7[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w3[3], w4[0], offset); + w7[2] = amd_bytealign (w3[2], w3[3], offset); + w7[1] = amd_bytealign (w3[1], w3[2], offset); + w7[0] = amd_bytealign (w3[0], w3[1], offset); + w6[3] = amd_bytealign (w2[3], w3[0], offset); + w6[2] = amd_bytealign (w2[2], w2[3], offset); + w6[1] = amd_bytealign (w2[1], w2[2], offset); + w6[0] = amd_bytealign (w2[0], w2[1], offset); + w5[3] = amd_bytealign (w1[3], w2[0], offset); + w5[2] = amd_bytealign (w1[2], w1[3], offset); + w5[1] = amd_bytealign (w1[1], w1[2], offset); + w5[0] = amd_bytealign (w1[0], w1[1], offset); + w4[3] = amd_bytealign (w0[3], w1[0], offset); + w4[2] = amd_bytealign (w0[2], w0[3], offset); + w4[1] = amd_bytealign (w0[1], w0[2], offset); + w4[0] = amd_bytealign (w0[0], w0[1], offset); + w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -7198,29 +7144,617 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 16: + w7[3] = amd_bytealign (w3[2], w3[3], offset); + w7[2] = amd_bytealign (w3[1], w3[2], offset); + w7[1] = amd_bytealign (w3[0], w3[1], offset); + w7[0] = amd_bytealign (w2[3], w3[0], offset); + w6[3] = amd_bytealign (w2[2], w2[3], offset); + w6[2] = amd_bytealign (w2[1], w2[2], offset); + w6[1] = amd_bytealign (w2[0], w2[1], offset); + w6[0] = amd_bytealign (w1[3], w2[0], offset); + w5[3] = amd_bytealign (w1[2], w1[3], offset); + w5[2] = amd_bytealign (w1[1], w1[2], offset); + w5[1] = amd_bytealign (w1[0], w1[1], offset); + w5[0] = amd_bytealign (w0[3], w1[0], offset); + w4[3] = amd_bytealign (w0[2], w0[3], offset); + w4[2] = amd_bytealign (w0[1], w0[2], offset); + w4[1] = amd_bytealign (w0[0], w0[1], offset); + w4[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign (w3[1], w3[2], offset); + w7[2] = amd_bytealign (w3[0], w3[1], offset); + w7[1] = amd_bytealign (w2[3], w3[0], offset); + w7[0] = amd_bytealign (w2[2], w2[3], offset); + w6[3] = amd_bytealign (w2[1], w2[2], offset); + w6[2] = amd_bytealign (w2[0], w2[1], offset); + w6[1] = amd_bytealign (w1[3], w2[0], offset); + w6[0] = amd_bytealign (w1[2], w1[3], offset); + w5[3] = amd_bytealign (w1[1], w1[2], offset); + w5[2] = amd_bytealign (w1[0], w1[1], offset); + w5[1] = amd_bytealign (w0[3], w1[0], offset); + w5[0] = amd_bytealign (w0[2], w0[3], offset); + w4[3] = amd_bytealign (w0[1], w0[2], offset); + w4[2] = amd_bytealign (w0[0], w0[1], offset); + w4[1] = amd_bytealign ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + w7[3] = amd_bytealign (w3[0], w3[1], offset); + w7[2] = amd_bytealign (w2[3], w3[0], offset); + w7[1] = amd_bytealign (w2[2], w2[3], offset); + w7[0] = amd_bytealign (w2[1], w2[2], offset); + w6[3] = amd_bytealign (w2[0], w2[1], offset); + w6[2] = amd_bytealign (w1[3], w2[0], offset); + w6[1] = amd_bytealign (w1[2], w1[3], offset); + w6[0] = amd_bytealign (w1[1], w1[2], offset); + w5[3] = amd_bytealign (w1[0], w1[1], offset); + w5[2] = amd_bytealign (w0[3], w1[0], offset); + w5[1] = amd_bytealign (w0[2], w0[3], offset); + w5[0] = amd_bytealign (w0[1], w0[2], offset); + w4[3] = amd_bytealign (w0[0], w0[1], offset); + w4[2] = amd_bytealign ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + w7[3] = amd_bytealign (w2[3], w3[0], offset); + w7[2] = amd_bytealign (w2[2], w2[3], offset); + w7[1] = amd_bytealign (w2[1], w2[2], offset); + w7[0] = amd_bytealign (w2[0], w2[1], offset); + w6[3] = amd_bytealign (w1[3], w2[0], offset); + w6[2] = amd_bytealign (w1[2], w1[3], offset); + w6[1] = amd_bytealign (w1[1], w1[2], offset); + w6[0] = amd_bytealign (w1[0], w1[1], offset); + w5[3] = amd_bytealign (w0[3], w1[0], offset); + w5[2] = amd_bytealign (w0[2], w0[3], offset); + w5[1] = amd_bytealign (w0[1], w0[2], offset); + w5[0] = amd_bytealign (w0[0], w0[1], offset); + w4[3] = amd_bytealign ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = amd_bytealign (w2[2], w2[3], offset); + w7[2] = amd_bytealign (w2[1], w2[2], offset); + w7[1] = amd_bytealign (w2[0], w2[1], offset); + w7[0] = amd_bytealign (w1[3], w2[0], offset); + w6[3] = amd_bytealign (w1[2], w1[3], offset); + w6[2] = amd_bytealign (w1[1], w1[2], offset); + w6[1] = amd_bytealign (w1[0], w1[1], offset); + w6[0] = amd_bytealign (w0[3], w1[0], offset); + w5[3] = amd_bytealign (w0[2], w0[3], offset); + w5[2] = amd_bytealign (w0[1], w0[2], offset); + w5[1] = amd_bytealign (w0[0], w0[1], offset); + w5[0] = amd_bytealign ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + w7[3] = amd_bytealign (w2[1], w2[2], offset); + w7[2] = amd_bytealign (w2[0], w2[1], offset); + w7[1] = amd_bytealign (w1[3], w2[0], offset); + w7[0] = amd_bytealign (w1[2], w1[3], offset); + w6[3] = amd_bytealign (w1[1], w1[2], offset); + w6[2] = amd_bytealign (w1[0], w1[1], offset); + w6[1] = amd_bytealign (w0[3], w1[0], offset); + w6[0] = amd_bytealign (w0[2], w0[3], offset); + w5[3] = amd_bytealign (w0[1], w0[2], offset); + w5[2] = amd_bytealign (w0[0], w0[1], offset); + w5[1] = amd_bytealign ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = amd_bytealign (w2[0], w2[1], offset); + w7[2] = amd_bytealign (w1[3], w2[0], offset); + w7[1] = amd_bytealign (w1[2], w1[3], offset); + w7[0] = amd_bytealign (w1[1], w1[2], offset); + w6[3] = amd_bytealign (w1[0], w1[1], offset); + w6[2] = amd_bytealign (w0[3], w1[0], offset); + w6[1] = amd_bytealign (w0[2], w0[3], offset); + w6[0] = amd_bytealign (w0[1], w0[2], offset); + w5[3] = amd_bytealign (w0[0], w0[1], offset); + w5[2] = amd_bytealign ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = amd_bytealign (w1[3], w2[0], offset); + w7[2] = amd_bytealign (w1[2], w1[3], offset); + w7[1] = amd_bytealign (w1[1], w1[2], offset); + w7[0] = amd_bytealign (w1[0], w1[1], offset); + w6[3] = amd_bytealign (w0[3], w1[0], offset); + w6[2] = amd_bytealign (w0[2], w0[3], offset); + w6[1] = amd_bytealign (w0[1], w0[2], offset); + w6[0] = amd_bytealign (w0[0], w0[1], offset); + w5[3] = amd_bytealign ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = amd_bytealign (w1[2], w1[3], offset); + w7[2] = amd_bytealign (w1[1], w1[2], offset); + w7[1] = amd_bytealign (w1[0], w1[1], offset); + w7[0] = amd_bytealign (w0[3], w1[0], offset); + w6[3] = amd_bytealign (w0[2], w0[3], offset); + w6[2] = amd_bytealign (w0[1], w0[2], offset); + w6[1] = amd_bytealign (w0[0], w0[1], offset); + w6[0] = amd_bytealign ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] = amd_bytealign (w1[1], w1[2], offset); + w7[2] = amd_bytealign (w1[0], w1[1], offset); + w7[1] = amd_bytealign (w0[3], w1[0], offset); + w7[0] = amd_bytealign (w0[2], w0[3], offset); + w6[3] = amd_bytealign (w0[1], w0[2], offset); + w6[2] = amd_bytealign (w0[0], w0[1], offset); + w6[1] = amd_bytealign ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = amd_bytealign (w1[0], w1[1], offset); + w7[2] = amd_bytealign (w0[3], w1[0], offset); + w7[1] = amd_bytealign (w0[2], w0[3], offset); + w7[0] = amd_bytealign (w0[1], w0[2], offset); + w6[3] = amd_bytealign (w0[0], w0[1], offset); + w6[2] = amd_bytealign ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign (w0[3], w1[0], offset); + w7[2] = amd_bytealign (w0[2], w0[3], offset); + w7[1] = amd_bytealign (w0[1], w0[2], offset); + w7[0] = amd_bytealign (w0[0], w0[1], offset); + w6[3] = amd_bytealign ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = amd_bytealign (w0[2], w0[3], offset); + w7[2] = amd_bytealign (w0[1], w0[2], offset); + w7[1] = amd_bytealign (w0[0], w0[1], offset); + w7[0] = amd_bytealign ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = amd_bytealign (w0[1], w0[2], offset); + w7[2] = amd_bytealign (w0[0], w0[1], offset); + w7[1] = amd_bytealign ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = amd_bytealign (w0[0], w0[1], offset); + w7[2] = amd_bytealign ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = amd_bytealign ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); #endif #ifdef IS_NV @@ -13485,6 +14019,17456 @@ void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u #endif } +void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) +{ + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); + + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign (w[62], w[63], offset); + w[62] = amd_bytealign (w[61], w[62], offset); + w[61] = amd_bytealign (w[60], w[61], offset); + w[60] = amd_bytealign (w[59], w[60], offset); + w[59] = amd_bytealign (w[58], w[59], offset); + w[58] = amd_bytealign (w[57], w[58], offset); + w[57] = amd_bytealign (w[56], w[57], offset); + w[56] = amd_bytealign (w[55], w[56], offset); + w[55] = amd_bytealign (w[54], w[55], offset); + w[54] = amd_bytealign (w[53], w[54], offset); + w[53] = amd_bytealign (w[52], w[53], offset); + w[52] = amd_bytealign (w[51], w[52], offset); + w[51] = amd_bytealign (w[50], w[51], offset); + w[50] = amd_bytealign (w[49], w[50], offset); + w[49] = amd_bytealign (w[48], w[49], offset); + w[48] = amd_bytealign (w[47], w[48], offset); + w[47] = amd_bytealign (w[46], w[47], offset); + w[46] = amd_bytealign (w[45], w[46], offset); + w[45] = amd_bytealign (w[44], w[45], offset); + w[44] = amd_bytealign (w[43], w[44], offset); + w[43] = amd_bytealign (w[42], w[43], offset); + w[42] = amd_bytealign (w[41], w[42], offset); + w[41] = amd_bytealign (w[40], w[41], offset); + w[40] = amd_bytealign (w[39], w[40], offset); + w[39] = amd_bytealign (w[38], w[39], offset); + w[38] = amd_bytealign (w[37], w[38], offset); + w[37] = amd_bytealign (w[36], w[37], offset); + w[36] = amd_bytealign (w[35], w[36], offset); + w[35] = amd_bytealign (w[34], w[35], offset); + w[34] = amd_bytealign (w[33], w[34], offset); + w[33] = amd_bytealign (w[32], w[33], offset); + w[32] = amd_bytealign (w[31], w[32], offset); + w[31] = amd_bytealign (w[30], w[31], offset); + w[30] = amd_bytealign (w[29], w[30], offset); + w[29] = amd_bytealign (w[28], w[29], offset); + w[28] = amd_bytealign (w[27], w[28], offset); + w[27] = amd_bytealign (w[26], w[27], offset); + w[26] = amd_bytealign (w[25], w[26], offset); + w[25] = amd_bytealign (w[24], w[25], offset); + w[24] = amd_bytealign (w[23], w[24], offset); + w[23] = amd_bytealign (w[22], w[23], offset); + w[22] = amd_bytealign (w[21], w[22], offset); + w[21] = amd_bytealign (w[20], w[21], offset); + w[20] = amd_bytealign (w[19], w[20], offset); + w[19] = amd_bytealign (w[18], w[19], offset); + w[18] = amd_bytealign (w[17], w[18], offset); + w[17] = amd_bytealign (w[16], w[17], offset); + w[16] = amd_bytealign (w[15], w[16], offset); + w[15] = amd_bytealign (w[14], w[15], offset); + w[14] = amd_bytealign (w[13], w[14], offset); + w[13] = amd_bytealign (w[12], w[13], offset); + w[12] = amd_bytealign (w[11], w[12], offset); + w[11] = amd_bytealign (w[10], w[11], offset); + w[10] = amd_bytealign (w[ 9], w[10], offset); + w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign (w[61], w[62], offset); + w[62] = amd_bytealign (w[60], w[61], offset); + w[61] = amd_bytealign (w[59], w[60], offset); + w[60] = amd_bytealign (w[58], w[59], offset); + w[59] = amd_bytealign (w[57], w[58], offset); + w[58] = amd_bytealign (w[56], w[57], offset); + w[57] = amd_bytealign (w[55], w[56], offset); + w[56] = amd_bytealign (w[54], w[55], offset); + w[55] = amd_bytealign (w[53], w[54], offset); + w[54] = amd_bytealign (w[52], w[53], offset); + w[53] = amd_bytealign (w[51], w[52], offset); + w[52] = amd_bytealign (w[50], w[51], offset); + w[51] = amd_bytealign (w[49], w[50], offset); + w[50] = amd_bytealign (w[48], w[49], offset); + w[49] = amd_bytealign (w[47], w[48], offset); + w[48] = amd_bytealign (w[46], w[47], offset); + w[47] = amd_bytealign (w[45], w[46], offset); + w[46] = amd_bytealign (w[44], w[45], offset); + w[45] = amd_bytealign (w[43], w[44], offset); + w[44] = amd_bytealign (w[42], w[43], offset); + w[43] = amd_bytealign (w[41], w[42], offset); + w[42] = amd_bytealign (w[40], w[41], offset); + w[41] = amd_bytealign (w[39], w[40], offset); + w[40] = amd_bytealign (w[38], w[39], offset); + w[39] = amd_bytealign (w[37], w[38], offset); + w[38] = amd_bytealign (w[36], w[37], offset); + w[37] = amd_bytealign (w[35], w[36], offset); + w[36] = amd_bytealign (w[34], w[35], offset); + w[35] = amd_bytealign (w[33], w[34], offset); + w[34] = amd_bytealign (w[32], w[33], offset); + w[33] = amd_bytealign (w[31], w[32], offset); + w[32] = amd_bytealign (w[30], w[31], offset); + w[31] = amd_bytealign (w[29], w[30], offset); + w[30] = amd_bytealign (w[28], w[29], offset); + w[29] = amd_bytealign (w[27], w[28], offset); + w[28] = amd_bytealign (w[26], w[27], offset); + w[27] = amd_bytealign (w[25], w[26], offset); + w[26] = amd_bytealign (w[24], w[25], offset); + w[25] = amd_bytealign (w[23], w[24], offset); + w[24] = amd_bytealign (w[22], w[23], offset); + w[23] = amd_bytealign (w[21], w[22], offset); + w[22] = amd_bytealign (w[20], w[21], offset); + w[21] = amd_bytealign (w[19], w[20], offset); + w[20] = amd_bytealign (w[18], w[19], offset); + w[19] = amd_bytealign (w[17], w[18], offset); + w[18] = amd_bytealign (w[16], w[17], offset); + w[17] = amd_bytealign (w[15], w[16], offset); + w[16] = amd_bytealign (w[14], w[15], offset); + w[15] = amd_bytealign (w[13], w[14], offset); + w[14] = amd_bytealign (w[12], w[13], offset); + w[13] = amd_bytealign (w[11], w[12], offset); + w[12] = amd_bytealign (w[10], w[11], offset); + w[11] = amd_bytealign (w[ 9], w[10], offset); + w[10] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign (w[60], w[61], offset); + w[62] = amd_bytealign (w[59], w[60], offset); + w[61] = amd_bytealign (w[58], w[59], offset); + w[60] = amd_bytealign (w[57], w[58], offset); + w[59] = amd_bytealign (w[56], w[57], offset); + w[58] = amd_bytealign (w[55], w[56], offset); + w[57] = amd_bytealign (w[54], w[55], offset); + w[56] = amd_bytealign (w[53], w[54], offset); + w[55] = amd_bytealign (w[52], w[53], offset); + w[54] = amd_bytealign (w[51], w[52], offset); + w[53] = amd_bytealign (w[50], w[51], offset); + w[52] = amd_bytealign (w[49], w[50], offset); + w[51] = amd_bytealign (w[48], w[49], offset); + w[50] = amd_bytealign (w[47], w[48], offset); + w[49] = amd_bytealign (w[46], w[47], offset); + w[48] = amd_bytealign (w[45], w[46], offset); + w[47] = amd_bytealign (w[44], w[45], offset); + w[46] = amd_bytealign (w[43], w[44], offset); + w[45] = amd_bytealign (w[42], w[43], offset); + w[44] = amd_bytealign (w[41], w[42], offset); + w[43] = amd_bytealign (w[40], w[41], offset); + w[42] = amd_bytealign (w[39], w[40], offset); + w[41] = amd_bytealign (w[38], w[39], offset); + w[40] = amd_bytealign (w[37], w[38], offset); + w[39] = amd_bytealign (w[36], w[37], offset); + w[38] = amd_bytealign (w[35], w[36], offset); + w[37] = amd_bytealign (w[34], w[35], offset); + w[36] = amd_bytealign (w[33], w[34], offset); + w[35] = amd_bytealign (w[32], w[33], offset); + w[34] = amd_bytealign (w[31], w[32], offset); + w[33] = amd_bytealign (w[30], w[31], offset); + w[32] = amd_bytealign (w[29], w[30], offset); + w[31] = amd_bytealign (w[28], w[29], offset); + w[30] = amd_bytealign (w[27], w[28], offset); + w[29] = amd_bytealign (w[26], w[27], offset); + w[28] = amd_bytealign (w[25], w[26], offset); + w[27] = amd_bytealign (w[24], w[25], offset); + w[26] = amd_bytealign (w[23], w[24], offset); + w[25] = amd_bytealign (w[22], w[23], offset); + w[24] = amd_bytealign (w[21], w[22], offset); + w[23] = amd_bytealign (w[20], w[21], offset); + w[22] = amd_bytealign (w[19], w[20], offset); + w[21] = amd_bytealign (w[18], w[19], offset); + w[20] = amd_bytealign (w[17], w[18], offset); + w[19] = amd_bytealign (w[16], w[17], offset); + w[18] = amd_bytealign (w[15], w[16], offset); + w[17] = amd_bytealign (w[14], w[15], offset); + w[16] = amd_bytealign (w[13], w[14], offset); + w[15] = amd_bytealign (w[12], w[13], offset); + w[14] = amd_bytealign (w[11], w[12], offset); + w[13] = amd_bytealign (w[10], w[11], offset); + w[12] = amd_bytealign (w[ 9], w[10], offset); + w[11] = amd_bytealign (w[ 8], w[ 9], offset); + w[10] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign (w[59], w[60], offset); + w[62] = amd_bytealign (w[58], w[59], offset); + w[61] = amd_bytealign (w[57], w[58], offset); + w[60] = amd_bytealign (w[56], w[57], offset); + w[59] = amd_bytealign (w[55], w[56], offset); + w[58] = amd_bytealign (w[54], w[55], offset); + w[57] = amd_bytealign (w[53], w[54], offset); + w[56] = amd_bytealign (w[52], w[53], offset); + w[55] = amd_bytealign (w[51], w[52], offset); + w[54] = amd_bytealign (w[50], w[51], offset); + w[53] = amd_bytealign (w[49], w[50], offset); + w[52] = amd_bytealign (w[48], w[49], offset); + w[51] = amd_bytealign (w[47], w[48], offset); + w[50] = amd_bytealign (w[46], w[47], offset); + w[49] = amd_bytealign (w[45], w[46], offset); + w[48] = amd_bytealign (w[44], w[45], offset); + w[47] = amd_bytealign (w[43], w[44], offset); + w[46] = amd_bytealign (w[42], w[43], offset); + w[45] = amd_bytealign (w[41], w[42], offset); + w[44] = amd_bytealign (w[40], w[41], offset); + w[43] = amd_bytealign (w[39], w[40], offset); + w[42] = amd_bytealign (w[38], w[39], offset); + w[41] = amd_bytealign (w[37], w[38], offset); + w[40] = amd_bytealign (w[36], w[37], offset); + w[39] = amd_bytealign (w[35], w[36], offset); + w[38] = amd_bytealign (w[34], w[35], offset); + w[37] = amd_bytealign (w[33], w[34], offset); + w[36] = amd_bytealign (w[32], w[33], offset); + w[35] = amd_bytealign (w[31], w[32], offset); + w[34] = amd_bytealign (w[30], w[31], offset); + w[33] = amd_bytealign (w[29], w[30], offset); + w[32] = amd_bytealign (w[28], w[29], offset); + w[31] = amd_bytealign (w[27], w[28], offset); + w[30] = amd_bytealign (w[26], w[27], offset); + w[29] = amd_bytealign (w[25], w[26], offset); + w[28] = amd_bytealign (w[24], w[25], offset); + w[27] = amd_bytealign (w[23], w[24], offset); + w[26] = amd_bytealign (w[22], w[23], offset); + w[25] = amd_bytealign (w[21], w[22], offset); + w[24] = amd_bytealign (w[20], w[21], offset); + w[23] = amd_bytealign (w[19], w[20], offset); + w[22] = amd_bytealign (w[18], w[19], offset); + w[21] = amd_bytealign (w[17], w[18], offset); + w[20] = amd_bytealign (w[16], w[17], offset); + w[19] = amd_bytealign (w[15], w[16], offset); + w[18] = amd_bytealign (w[14], w[15], offset); + w[17] = amd_bytealign (w[13], w[14], offset); + w[16] = amd_bytealign (w[12], w[13], offset); + w[15] = amd_bytealign (w[11], w[12], offset); + w[14] = amd_bytealign (w[10], w[11], offset); + w[13] = amd_bytealign (w[ 9], w[10], offset); + w[12] = amd_bytealign (w[ 8], w[ 9], offset); + w[11] = amd_bytealign (w[ 7], w[ 8], offset); + w[10] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign (w[58], w[59], offset); + w[62] = amd_bytealign (w[57], w[58], offset); + w[61] = amd_bytealign (w[56], w[57], offset); + w[60] = amd_bytealign (w[55], w[56], offset); + w[59] = amd_bytealign (w[54], w[55], offset); + w[58] = amd_bytealign (w[53], w[54], offset); + w[57] = amd_bytealign (w[52], w[53], offset); + w[56] = amd_bytealign (w[51], w[52], offset); + w[55] = amd_bytealign (w[50], w[51], offset); + w[54] = amd_bytealign (w[49], w[50], offset); + w[53] = amd_bytealign (w[48], w[49], offset); + w[52] = amd_bytealign (w[47], w[48], offset); + w[51] = amd_bytealign (w[46], w[47], offset); + w[50] = amd_bytealign (w[45], w[46], offset); + w[49] = amd_bytealign (w[44], w[45], offset); + w[48] = amd_bytealign (w[43], w[44], offset); + w[47] = amd_bytealign (w[42], w[43], offset); + w[46] = amd_bytealign (w[41], w[42], offset); + w[45] = amd_bytealign (w[40], w[41], offset); + w[44] = amd_bytealign (w[39], w[40], offset); + w[43] = amd_bytealign (w[38], w[39], offset); + w[42] = amd_bytealign (w[37], w[38], offset); + w[41] = amd_bytealign (w[36], w[37], offset); + w[40] = amd_bytealign (w[35], w[36], offset); + w[39] = amd_bytealign (w[34], w[35], offset); + w[38] = amd_bytealign (w[33], w[34], offset); + w[37] = amd_bytealign (w[32], w[33], offset); + w[36] = amd_bytealign (w[31], w[32], offset); + w[35] = amd_bytealign (w[30], w[31], offset); + w[34] = amd_bytealign (w[29], w[30], offset); + w[33] = amd_bytealign (w[28], w[29], offset); + w[32] = amd_bytealign (w[27], w[28], offset); + w[31] = amd_bytealign (w[26], w[27], offset); + w[30] = amd_bytealign (w[25], w[26], offset); + w[29] = amd_bytealign (w[24], w[25], offset); + w[28] = amd_bytealign (w[23], w[24], offset); + w[27] = amd_bytealign (w[22], w[23], offset); + w[26] = amd_bytealign (w[21], w[22], offset); + w[25] = amd_bytealign (w[20], w[21], offset); + w[24] = amd_bytealign (w[19], w[20], offset); + w[23] = amd_bytealign (w[18], w[19], offset); + w[22] = amd_bytealign (w[17], w[18], offset); + w[21] = amd_bytealign (w[16], w[17], offset); + w[20] = amd_bytealign (w[15], w[16], offset); + w[19] = amd_bytealign (w[14], w[15], offset); + w[18] = amd_bytealign (w[13], w[14], offset); + w[17] = amd_bytealign (w[12], w[13], offset); + w[16] = amd_bytealign (w[11], w[12], offset); + w[15] = amd_bytealign (w[10], w[11], offset); + w[14] = amd_bytealign (w[ 9], w[10], offset); + w[13] = amd_bytealign (w[ 8], w[ 9], offset); + w[12] = amd_bytealign (w[ 7], w[ 8], offset); + w[11] = amd_bytealign (w[ 6], w[ 7], offset); + w[10] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign (w[57], w[58], offset); + w[62] = amd_bytealign (w[56], w[57], offset); + w[61] = amd_bytealign (w[55], w[56], offset); + w[60] = amd_bytealign (w[54], w[55], offset); + w[59] = amd_bytealign (w[53], w[54], offset); + w[58] = amd_bytealign (w[52], w[53], offset); + w[57] = amd_bytealign (w[51], w[52], offset); + w[56] = amd_bytealign (w[50], w[51], offset); + w[55] = amd_bytealign (w[49], w[50], offset); + w[54] = amd_bytealign (w[48], w[49], offset); + w[53] = amd_bytealign (w[47], w[48], offset); + w[52] = amd_bytealign (w[46], w[47], offset); + w[51] = amd_bytealign (w[45], w[46], offset); + w[50] = amd_bytealign (w[44], w[45], offset); + w[49] = amd_bytealign (w[43], w[44], offset); + w[48] = amd_bytealign (w[42], w[43], offset); + w[47] = amd_bytealign (w[41], w[42], offset); + w[46] = amd_bytealign (w[40], w[41], offset); + w[45] = amd_bytealign (w[39], w[40], offset); + w[44] = amd_bytealign (w[38], w[39], offset); + w[43] = amd_bytealign (w[37], w[38], offset); + w[42] = amd_bytealign (w[36], w[37], offset); + w[41] = amd_bytealign (w[35], w[36], offset); + w[40] = amd_bytealign (w[34], w[35], offset); + w[39] = amd_bytealign (w[33], w[34], offset); + w[38] = amd_bytealign (w[32], w[33], offset); + w[37] = amd_bytealign (w[31], w[32], offset); + w[36] = amd_bytealign (w[30], w[31], offset); + w[35] = amd_bytealign (w[29], w[30], offset); + w[34] = amd_bytealign (w[28], w[29], offset); + w[33] = amd_bytealign (w[27], w[28], offset); + w[32] = amd_bytealign (w[26], w[27], offset); + w[31] = amd_bytealign (w[25], w[26], offset); + w[30] = amd_bytealign (w[24], w[25], offset); + w[29] = amd_bytealign (w[23], w[24], offset); + w[28] = amd_bytealign (w[22], w[23], offset); + w[27] = amd_bytealign (w[21], w[22], offset); + w[26] = amd_bytealign (w[20], w[21], offset); + w[25] = amd_bytealign (w[19], w[20], offset); + w[24] = amd_bytealign (w[18], w[19], offset); + w[23] = amd_bytealign (w[17], w[18], offset); + w[22] = amd_bytealign (w[16], w[17], offset); + w[21] = amd_bytealign (w[15], w[16], offset); + w[20] = amd_bytealign (w[14], w[15], offset); + w[19] = amd_bytealign (w[13], w[14], offset); + w[18] = amd_bytealign (w[12], w[13], offset); + w[17] = amd_bytealign (w[11], w[12], offset); + w[16] = amd_bytealign (w[10], w[11], offset); + w[15] = amd_bytealign (w[ 9], w[10], offset); + w[14] = amd_bytealign (w[ 8], w[ 9], offset); + w[13] = amd_bytealign (w[ 7], w[ 8], offset); + w[12] = amd_bytealign (w[ 6], w[ 7], offset); + w[11] = amd_bytealign (w[ 5], w[ 6], offset); + w[10] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign (w[56], w[57], offset); + w[62] = amd_bytealign (w[55], w[56], offset); + w[61] = amd_bytealign (w[54], w[55], offset); + w[60] = amd_bytealign (w[53], w[54], offset); + w[59] = amd_bytealign (w[52], w[53], offset); + w[58] = amd_bytealign (w[51], w[52], offset); + w[57] = amd_bytealign (w[50], w[51], offset); + w[56] = amd_bytealign (w[49], w[50], offset); + w[55] = amd_bytealign (w[48], w[49], offset); + w[54] = amd_bytealign (w[47], w[48], offset); + w[53] = amd_bytealign (w[46], w[47], offset); + w[52] = amd_bytealign (w[45], w[46], offset); + w[51] = amd_bytealign (w[44], w[45], offset); + w[50] = amd_bytealign (w[43], w[44], offset); + w[49] = amd_bytealign (w[42], w[43], offset); + w[48] = amd_bytealign (w[41], w[42], offset); + w[47] = amd_bytealign (w[40], w[41], offset); + w[46] = amd_bytealign (w[39], w[40], offset); + w[45] = amd_bytealign (w[38], w[39], offset); + w[44] = amd_bytealign (w[37], w[38], offset); + w[43] = amd_bytealign (w[36], w[37], offset); + w[42] = amd_bytealign (w[35], w[36], offset); + w[41] = amd_bytealign (w[34], w[35], offset); + w[40] = amd_bytealign (w[33], w[34], offset); + w[39] = amd_bytealign (w[32], w[33], offset); + w[38] = amd_bytealign (w[31], w[32], offset); + w[37] = amd_bytealign (w[30], w[31], offset); + w[36] = amd_bytealign (w[29], w[30], offset); + w[35] = amd_bytealign (w[28], w[29], offset); + w[34] = amd_bytealign (w[27], w[28], offset); + w[33] = amd_bytealign (w[26], w[27], offset); + w[32] = amd_bytealign (w[25], w[26], offset); + w[31] = amd_bytealign (w[24], w[25], offset); + w[30] = amd_bytealign (w[23], w[24], offset); + w[29] = amd_bytealign (w[22], w[23], offset); + w[28] = amd_bytealign (w[21], w[22], offset); + w[27] = amd_bytealign (w[20], w[21], offset); + w[26] = amd_bytealign (w[19], w[20], offset); + w[25] = amd_bytealign (w[18], w[19], offset); + w[24] = amd_bytealign (w[17], w[18], offset); + w[23] = amd_bytealign (w[16], w[17], offset); + w[22] = amd_bytealign (w[15], w[16], offset); + w[21] = amd_bytealign (w[14], w[15], offset); + w[20] = amd_bytealign (w[13], w[14], offset); + w[19] = amd_bytealign (w[12], w[13], offset); + w[18] = amd_bytealign (w[11], w[12], offset); + w[17] = amd_bytealign (w[10], w[11], offset); + w[16] = amd_bytealign (w[ 9], w[10], offset); + w[15] = amd_bytealign (w[ 8], w[ 9], offset); + w[14] = amd_bytealign (w[ 7], w[ 8], offset); + w[13] = amd_bytealign (w[ 6], w[ 7], offset); + w[12] = amd_bytealign (w[ 5], w[ 6], offset); + w[11] = amd_bytealign (w[ 4], w[ 5], offset); + w[10] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign (w[55], w[56], offset); + w[62] = amd_bytealign (w[54], w[55], offset); + w[61] = amd_bytealign (w[53], w[54], offset); + w[60] = amd_bytealign (w[52], w[53], offset); + w[59] = amd_bytealign (w[51], w[52], offset); + w[58] = amd_bytealign (w[50], w[51], offset); + w[57] = amd_bytealign (w[49], w[50], offset); + w[56] = amd_bytealign (w[48], w[49], offset); + w[55] = amd_bytealign (w[47], w[48], offset); + w[54] = amd_bytealign (w[46], w[47], offset); + w[53] = amd_bytealign (w[45], w[46], offset); + w[52] = amd_bytealign (w[44], w[45], offset); + w[51] = amd_bytealign (w[43], w[44], offset); + w[50] = amd_bytealign (w[42], w[43], offset); + w[49] = amd_bytealign (w[41], w[42], offset); + w[48] = amd_bytealign (w[40], w[41], offset); + w[47] = amd_bytealign (w[39], w[40], offset); + w[46] = amd_bytealign (w[38], w[39], offset); + w[45] = amd_bytealign (w[37], w[38], offset); + w[44] = amd_bytealign (w[36], w[37], offset); + w[43] = amd_bytealign (w[35], w[36], offset); + w[42] = amd_bytealign (w[34], w[35], offset); + w[41] = amd_bytealign (w[33], w[34], offset); + w[40] = amd_bytealign (w[32], w[33], offset); + w[39] = amd_bytealign (w[31], w[32], offset); + w[38] = amd_bytealign (w[30], w[31], offset); + w[37] = amd_bytealign (w[29], w[30], offset); + w[36] = amd_bytealign (w[28], w[29], offset); + w[35] = amd_bytealign (w[27], w[28], offset); + w[34] = amd_bytealign (w[26], w[27], offset); + w[33] = amd_bytealign (w[25], w[26], offset); + w[32] = amd_bytealign (w[24], w[25], offset); + w[31] = amd_bytealign (w[23], w[24], offset); + w[30] = amd_bytealign (w[22], w[23], offset); + w[29] = amd_bytealign (w[21], w[22], offset); + w[28] = amd_bytealign (w[20], w[21], offset); + w[27] = amd_bytealign (w[19], w[20], offset); + w[26] = amd_bytealign (w[18], w[19], offset); + w[25] = amd_bytealign (w[17], w[18], offset); + w[24] = amd_bytealign (w[16], w[17], offset); + w[23] = amd_bytealign (w[15], w[16], offset); + w[22] = amd_bytealign (w[14], w[15], offset); + w[21] = amd_bytealign (w[13], w[14], offset); + w[20] = amd_bytealign (w[12], w[13], offset); + w[19] = amd_bytealign (w[11], w[12], offset); + w[18] = amd_bytealign (w[10], w[11], offset); + w[17] = amd_bytealign (w[ 9], w[10], offset); + w[16] = amd_bytealign (w[ 8], w[ 9], offset); + w[15] = amd_bytealign (w[ 7], w[ 8], offset); + w[14] = amd_bytealign (w[ 6], w[ 7], offset); + w[13] = amd_bytealign (w[ 5], w[ 6], offset); + w[12] = amd_bytealign (w[ 4], w[ 5], offset); + w[11] = amd_bytealign (w[ 3], w[ 4], offset); + w[10] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = amd_bytealign (w[54], w[55], offset); + w[62] = amd_bytealign (w[53], w[54], offset); + w[61] = amd_bytealign (w[52], w[53], offset); + w[60] = amd_bytealign (w[51], w[52], offset); + w[59] = amd_bytealign (w[50], w[51], offset); + w[58] = amd_bytealign (w[49], w[50], offset); + w[57] = amd_bytealign (w[48], w[49], offset); + w[56] = amd_bytealign (w[47], w[48], offset); + w[55] = amd_bytealign (w[46], w[47], offset); + w[54] = amd_bytealign (w[45], w[46], offset); + w[53] = amd_bytealign (w[44], w[45], offset); + w[52] = amd_bytealign (w[43], w[44], offset); + w[51] = amd_bytealign (w[42], w[43], offset); + w[50] = amd_bytealign (w[41], w[42], offset); + w[49] = amd_bytealign (w[40], w[41], offset); + w[48] = amd_bytealign (w[39], w[40], offset); + w[47] = amd_bytealign (w[38], w[39], offset); + w[46] = amd_bytealign (w[37], w[38], offset); + w[45] = amd_bytealign (w[36], w[37], offset); + w[44] = amd_bytealign (w[35], w[36], offset); + w[43] = amd_bytealign (w[34], w[35], offset); + w[42] = amd_bytealign (w[33], w[34], offset); + w[41] = amd_bytealign (w[32], w[33], offset); + w[40] = amd_bytealign (w[31], w[32], offset); + w[39] = amd_bytealign (w[30], w[31], offset); + w[38] = amd_bytealign (w[29], w[30], offset); + w[37] = amd_bytealign (w[28], w[29], offset); + w[36] = amd_bytealign (w[27], w[28], offset); + w[35] = amd_bytealign (w[26], w[27], offset); + w[34] = amd_bytealign (w[25], w[26], offset); + w[33] = amd_bytealign (w[24], w[25], offset); + w[32] = amd_bytealign (w[23], w[24], offset); + w[31] = amd_bytealign (w[22], w[23], offset); + w[30] = amd_bytealign (w[21], w[22], offset); + w[29] = amd_bytealign (w[20], w[21], offset); + w[28] = amd_bytealign (w[19], w[20], offset); + w[27] = amd_bytealign (w[18], w[19], offset); + w[26] = amd_bytealign (w[17], w[18], offset); + w[25] = amd_bytealign (w[16], w[17], offset); + w[24] = amd_bytealign (w[15], w[16], offset); + w[23] = amd_bytealign (w[14], w[15], offset); + w[22] = amd_bytealign (w[13], w[14], offset); + w[21] = amd_bytealign (w[12], w[13], offset); + w[20] = amd_bytealign (w[11], w[12], offset); + w[19] = amd_bytealign (w[10], w[11], offset); + w[18] = amd_bytealign (w[ 9], w[10], offset); + w[17] = amd_bytealign (w[ 8], w[ 9], offset); + w[16] = amd_bytealign (w[ 7], w[ 8], offset); + w[15] = amd_bytealign (w[ 6], w[ 7], offset); + w[14] = amd_bytealign (w[ 5], w[ 6], offset); + w[13] = amd_bytealign (w[ 4], w[ 5], offset); + w[12] = amd_bytealign (w[ 3], w[ 4], offset); + w[11] = amd_bytealign (w[ 2], w[ 3], offset); + w[10] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign (w[53], w[54], offset); + w[62] = amd_bytealign (w[52], w[53], offset); + w[61] = amd_bytealign (w[51], w[52], offset); + w[60] = amd_bytealign (w[50], w[51], offset); + w[59] = amd_bytealign (w[49], w[50], offset); + w[58] = amd_bytealign (w[48], w[49], offset); + w[57] = amd_bytealign (w[47], w[48], offset); + w[56] = amd_bytealign (w[46], w[47], offset); + w[55] = amd_bytealign (w[45], w[46], offset); + w[54] = amd_bytealign (w[44], w[45], offset); + w[53] = amd_bytealign (w[43], w[44], offset); + w[52] = amd_bytealign (w[42], w[43], offset); + w[51] = amd_bytealign (w[41], w[42], offset); + w[50] = amd_bytealign (w[40], w[41], offset); + w[49] = amd_bytealign (w[39], w[40], offset); + w[48] = amd_bytealign (w[38], w[39], offset); + w[47] = amd_bytealign (w[37], w[38], offset); + w[46] = amd_bytealign (w[36], w[37], offset); + w[45] = amd_bytealign (w[35], w[36], offset); + w[44] = amd_bytealign (w[34], w[35], offset); + w[43] = amd_bytealign (w[33], w[34], offset); + w[42] = amd_bytealign (w[32], w[33], offset); + w[41] = amd_bytealign (w[31], w[32], offset); + w[40] = amd_bytealign (w[30], w[31], offset); + w[39] = amd_bytealign (w[29], w[30], offset); + w[38] = amd_bytealign (w[28], w[29], offset); + w[37] = amd_bytealign (w[27], w[28], offset); + w[36] = amd_bytealign (w[26], w[27], offset); + w[35] = amd_bytealign (w[25], w[26], offset); + w[34] = amd_bytealign (w[24], w[25], offset); + w[33] = amd_bytealign (w[23], w[24], offset); + w[32] = amd_bytealign (w[22], w[23], offset); + w[31] = amd_bytealign (w[21], w[22], offset); + w[30] = amd_bytealign (w[20], w[21], offset); + w[29] = amd_bytealign (w[19], w[20], offset); + w[28] = amd_bytealign (w[18], w[19], offset); + w[27] = amd_bytealign (w[17], w[18], offset); + w[26] = amd_bytealign (w[16], w[17], offset); + w[25] = amd_bytealign (w[15], w[16], offset); + w[24] = amd_bytealign (w[14], w[15], offset); + w[23] = amd_bytealign (w[13], w[14], offset); + w[22] = amd_bytealign (w[12], w[13], offset); + w[21] = amd_bytealign (w[11], w[12], offset); + w[20] = amd_bytealign (w[10], w[11], offset); + w[19] = amd_bytealign (w[ 9], w[10], offset); + w[18] = amd_bytealign (w[ 8], w[ 9], offset); + w[17] = amd_bytealign (w[ 7], w[ 8], offset); + w[16] = amd_bytealign (w[ 6], w[ 7], offset); + w[15] = amd_bytealign (w[ 5], w[ 6], offset); + w[14] = amd_bytealign (w[ 4], w[ 5], offset); + w[13] = amd_bytealign (w[ 3], w[ 4], offset); + w[12] = amd_bytealign (w[ 2], w[ 3], offset); + w[11] = amd_bytealign (w[ 1], w[ 2], offset); + w[10] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign (w[52], w[53], offset); + w[62] = amd_bytealign (w[51], w[52], offset); + w[61] = amd_bytealign (w[50], w[51], offset); + w[60] = amd_bytealign (w[49], w[50], offset); + w[59] = amd_bytealign (w[48], w[49], offset); + w[58] = amd_bytealign (w[47], w[48], offset); + w[57] = amd_bytealign (w[46], w[47], offset); + w[56] = amd_bytealign (w[45], w[46], offset); + w[55] = amd_bytealign (w[44], w[45], offset); + w[54] = amd_bytealign (w[43], w[44], offset); + w[53] = amd_bytealign (w[42], w[43], offset); + w[52] = amd_bytealign (w[41], w[42], offset); + w[51] = amd_bytealign (w[40], w[41], offset); + w[50] = amd_bytealign (w[39], w[40], offset); + w[49] = amd_bytealign (w[38], w[39], offset); + w[48] = amd_bytealign (w[37], w[38], offset); + w[47] = amd_bytealign (w[36], w[37], offset); + w[46] = amd_bytealign (w[35], w[36], offset); + w[45] = amd_bytealign (w[34], w[35], offset); + w[44] = amd_bytealign (w[33], w[34], offset); + w[43] = amd_bytealign (w[32], w[33], offset); + w[42] = amd_bytealign (w[31], w[32], offset); + w[41] = amd_bytealign (w[30], w[31], offset); + w[40] = amd_bytealign (w[29], w[30], offset); + w[39] = amd_bytealign (w[28], w[29], offset); + w[38] = amd_bytealign (w[27], w[28], offset); + w[37] = amd_bytealign (w[26], w[27], offset); + w[36] = amd_bytealign (w[25], w[26], offset); + w[35] = amd_bytealign (w[24], w[25], offset); + w[34] = amd_bytealign (w[23], w[24], offset); + w[33] = amd_bytealign (w[22], w[23], offset); + w[32] = amd_bytealign (w[21], w[22], offset); + w[31] = amd_bytealign (w[20], w[21], offset); + w[30] = amd_bytealign (w[19], w[20], offset); + w[29] = amd_bytealign (w[18], w[19], offset); + w[28] = amd_bytealign (w[17], w[18], offset); + w[27] = amd_bytealign (w[16], w[17], offset); + w[26] = amd_bytealign (w[15], w[16], offset); + w[25] = amd_bytealign (w[14], w[15], offset); + w[24] = amd_bytealign (w[13], w[14], offset); + w[23] = amd_bytealign (w[12], w[13], offset); + w[22] = amd_bytealign (w[11], w[12], offset); + w[21] = amd_bytealign (w[10], w[11], offset); + w[20] = amd_bytealign (w[ 9], w[10], offset); + w[19] = amd_bytealign (w[ 8], w[ 9], offset); + w[18] = amd_bytealign (w[ 7], w[ 8], offset); + w[17] = amd_bytealign (w[ 6], w[ 7], offset); + w[16] = amd_bytealign (w[ 5], w[ 6], offset); + w[15] = amd_bytealign (w[ 4], w[ 5], offset); + w[14] = amd_bytealign (w[ 3], w[ 4], offset); + w[13] = amd_bytealign (w[ 2], w[ 3], offset); + w[12] = amd_bytealign (w[ 1], w[ 2], offset); + w[11] = amd_bytealign (w[ 0], w[ 1], offset); + w[10] = amd_bytealign ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign (w[51], w[52], offset); + w[62] = amd_bytealign (w[50], w[51], offset); + w[61] = amd_bytealign (w[49], w[50], offset); + w[60] = amd_bytealign (w[48], w[49], offset); + w[59] = amd_bytealign (w[47], w[48], offset); + w[58] = amd_bytealign (w[46], w[47], offset); + w[57] = amd_bytealign (w[45], w[46], offset); + w[56] = amd_bytealign (w[44], w[45], offset); + w[55] = amd_bytealign (w[43], w[44], offset); + w[54] = amd_bytealign (w[42], w[43], offset); + w[53] = amd_bytealign (w[41], w[42], offset); + w[52] = amd_bytealign (w[40], w[41], offset); + w[51] = amd_bytealign (w[39], w[40], offset); + w[50] = amd_bytealign (w[38], w[39], offset); + w[49] = amd_bytealign (w[37], w[38], offset); + w[48] = amd_bytealign (w[36], w[37], offset); + w[47] = amd_bytealign (w[35], w[36], offset); + w[46] = amd_bytealign (w[34], w[35], offset); + w[45] = amd_bytealign (w[33], w[34], offset); + w[44] = amd_bytealign (w[32], w[33], offset); + w[43] = amd_bytealign (w[31], w[32], offset); + w[42] = amd_bytealign (w[30], w[31], offset); + w[41] = amd_bytealign (w[29], w[30], offset); + w[40] = amd_bytealign (w[28], w[29], offset); + w[39] = amd_bytealign (w[27], w[28], offset); + w[38] = amd_bytealign (w[26], w[27], offset); + w[37] = amd_bytealign (w[25], w[26], offset); + w[36] = amd_bytealign (w[24], w[25], offset); + w[35] = amd_bytealign (w[23], w[24], offset); + w[34] = amd_bytealign (w[22], w[23], offset); + w[33] = amd_bytealign (w[21], w[22], offset); + w[32] = amd_bytealign (w[20], w[21], offset); + w[31] = amd_bytealign (w[19], w[20], offset); + w[30] = amd_bytealign (w[18], w[19], offset); + w[29] = amd_bytealign (w[17], w[18], offset); + w[28] = amd_bytealign (w[16], w[17], offset); + w[27] = amd_bytealign (w[15], w[16], offset); + w[26] = amd_bytealign (w[14], w[15], offset); + w[25] = amd_bytealign (w[13], w[14], offset); + w[24] = amd_bytealign (w[12], w[13], offset); + w[23] = amd_bytealign (w[11], w[12], offset); + w[22] = amd_bytealign (w[10], w[11], offset); + w[21] = amd_bytealign (w[ 9], w[10], offset); + w[20] = amd_bytealign (w[ 8], w[ 9], offset); + w[19] = amd_bytealign (w[ 7], w[ 8], offset); + w[18] = amd_bytealign (w[ 6], w[ 7], offset); + w[17] = amd_bytealign (w[ 5], w[ 6], offset); + w[16] = amd_bytealign (w[ 4], w[ 5], offset); + w[15] = amd_bytealign (w[ 3], w[ 4], offset); + w[14] = amd_bytealign (w[ 2], w[ 3], offset); + w[13] = amd_bytealign (w[ 1], w[ 2], offset); + w[12] = amd_bytealign (w[ 0], w[ 1], offset); + w[11] = amd_bytealign ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign (w[50], w[51], offset); + w[62] = amd_bytealign (w[49], w[50], offset); + w[61] = amd_bytealign (w[48], w[49], offset); + w[60] = amd_bytealign (w[47], w[48], offset); + w[59] = amd_bytealign (w[46], w[47], offset); + w[58] = amd_bytealign (w[45], w[46], offset); + w[57] = amd_bytealign (w[44], w[45], offset); + w[56] = amd_bytealign (w[43], w[44], offset); + w[55] = amd_bytealign (w[42], w[43], offset); + w[54] = amd_bytealign (w[41], w[42], offset); + w[53] = amd_bytealign (w[40], w[41], offset); + w[52] = amd_bytealign (w[39], w[40], offset); + w[51] = amd_bytealign (w[38], w[39], offset); + w[50] = amd_bytealign (w[37], w[38], offset); + w[49] = amd_bytealign (w[36], w[37], offset); + w[48] = amd_bytealign (w[35], w[36], offset); + w[47] = amd_bytealign (w[34], w[35], offset); + w[46] = amd_bytealign (w[33], w[34], offset); + w[45] = amd_bytealign (w[32], w[33], offset); + w[44] = amd_bytealign (w[31], w[32], offset); + w[43] = amd_bytealign (w[30], w[31], offset); + w[42] = amd_bytealign (w[29], w[30], offset); + w[41] = amd_bytealign (w[28], w[29], offset); + w[40] = amd_bytealign (w[27], w[28], offset); + w[39] = amd_bytealign (w[26], w[27], offset); + w[38] = amd_bytealign (w[25], w[26], offset); + w[37] = amd_bytealign (w[24], w[25], offset); + w[36] = amd_bytealign (w[23], w[24], offset); + w[35] = amd_bytealign (w[22], w[23], offset); + w[34] = amd_bytealign (w[21], w[22], offset); + w[33] = amd_bytealign (w[20], w[21], offset); + w[32] = amd_bytealign (w[19], w[20], offset); + w[31] = amd_bytealign (w[18], w[19], offset); + w[30] = amd_bytealign (w[17], w[18], offset); + w[29] = amd_bytealign (w[16], w[17], offset); + w[28] = amd_bytealign (w[15], w[16], offset); + w[27] = amd_bytealign (w[14], w[15], offset); + w[26] = amd_bytealign (w[13], w[14], offset); + w[25] = amd_bytealign (w[12], w[13], offset); + w[24] = amd_bytealign (w[11], w[12], offset); + w[23] = amd_bytealign (w[10], w[11], offset); + w[22] = amd_bytealign (w[ 9], w[10], offset); + w[21] = amd_bytealign (w[ 8], w[ 9], offset); + w[20] = amd_bytealign (w[ 7], w[ 8], offset); + w[19] = amd_bytealign (w[ 6], w[ 7], offset); + w[18] = amd_bytealign (w[ 5], w[ 6], offset); + w[17] = amd_bytealign (w[ 4], w[ 5], offset); + w[16] = amd_bytealign (w[ 3], w[ 4], offset); + w[15] = amd_bytealign (w[ 2], w[ 3], offset); + w[14] = amd_bytealign (w[ 1], w[ 2], offset); + w[13] = amd_bytealign (w[ 0], w[ 1], offset); + w[12] = amd_bytealign ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign (w[49], w[50], offset); + w[62] = amd_bytealign (w[48], w[49], offset); + w[61] = amd_bytealign (w[47], w[48], offset); + w[60] = amd_bytealign (w[46], w[47], offset); + w[59] = amd_bytealign (w[45], w[46], offset); + w[58] = amd_bytealign (w[44], w[45], offset); + w[57] = amd_bytealign (w[43], w[44], offset); + w[56] = amd_bytealign (w[42], w[43], offset); + w[55] = amd_bytealign (w[41], w[42], offset); + w[54] = amd_bytealign (w[40], w[41], offset); + w[53] = amd_bytealign (w[39], w[40], offset); + w[52] = amd_bytealign (w[38], w[39], offset); + w[51] = amd_bytealign (w[37], w[38], offset); + w[50] = amd_bytealign (w[36], w[37], offset); + w[49] = amd_bytealign (w[35], w[36], offset); + w[48] = amd_bytealign (w[34], w[35], offset); + w[47] = amd_bytealign (w[33], w[34], offset); + w[46] = amd_bytealign (w[32], w[33], offset); + w[45] = amd_bytealign (w[31], w[32], offset); + w[44] = amd_bytealign (w[30], w[31], offset); + w[43] = amd_bytealign (w[29], w[30], offset); + w[42] = amd_bytealign (w[28], w[29], offset); + w[41] = amd_bytealign (w[27], w[28], offset); + w[40] = amd_bytealign (w[26], w[27], offset); + w[39] = amd_bytealign (w[25], w[26], offset); + w[38] = amd_bytealign (w[24], w[25], offset); + w[37] = amd_bytealign (w[23], w[24], offset); + w[36] = amd_bytealign (w[22], w[23], offset); + w[35] = amd_bytealign (w[21], w[22], offset); + w[34] = amd_bytealign (w[20], w[21], offset); + w[33] = amd_bytealign (w[19], w[20], offset); + w[32] = amd_bytealign (w[18], w[19], offset); + w[31] = amd_bytealign (w[17], w[18], offset); + w[30] = amd_bytealign (w[16], w[17], offset); + w[29] = amd_bytealign (w[15], w[16], offset); + w[28] = amd_bytealign (w[14], w[15], offset); + w[27] = amd_bytealign (w[13], w[14], offset); + w[26] = amd_bytealign (w[12], w[13], offset); + w[25] = amd_bytealign (w[11], w[12], offset); + w[24] = amd_bytealign (w[10], w[11], offset); + w[23] = amd_bytealign (w[ 9], w[10], offset); + w[22] = amd_bytealign (w[ 8], w[ 9], offset); + w[21] = amd_bytealign (w[ 7], w[ 8], offset); + w[20] = amd_bytealign (w[ 6], w[ 7], offset); + w[19] = amd_bytealign (w[ 5], w[ 6], offset); + w[18] = amd_bytealign (w[ 4], w[ 5], offset); + w[17] = amd_bytealign (w[ 3], w[ 4], offset); + w[16] = amd_bytealign (w[ 2], w[ 3], offset); + w[15] = amd_bytealign (w[ 1], w[ 2], offset); + w[14] = amd_bytealign (w[ 0], w[ 1], offset); + w[13] = amd_bytealign ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign (w[48], w[49], offset); + w[62] = amd_bytealign (w[47], w[48], offset); + w[61] = amd_bytealign (w[46], w[47], offset); + w[60] = amd_bytealign (w[45], w[46], offset); + w[59] = amd_bytealign (w[44], w[45], offset); + w[58] = amd_bytealign (w[43], w[44], offset); + w[57] = amd_bytealign (w[42], w[43], offset); + w[56] = amd_bytealign (w[41], w[42], offset); + w[55] = amd_bytealign (w[40], w[41], offset); + w[54] = amd_bytealign (w[39], w[40], offset); + w[53] = amd_bytealign (w[38], w[39], offset); + w[52] = amd_bytealign (w[37], w[38], offset); + w[51] = amd_bytealign (w[36], w[37], offset); + w[50] = amd_bytealign (w[35], w[36], offset); + w[49] = amd_bytealign (w[34], w[35], offset); + w[48] = amd_bytealign (w[33], w[34], offset); + w[47] = amd_bytealign (w[32], w[33], offset); + w[46] = amd_bytealign (w[31], w[32], offset); + w[45] = amd_bytealign (w[30], w[31], offset); + w[44] = amd_bytealign (w[29], w[30], offset); + w[43] = amd_bytealign (w[28], w[29], offset); + w[42] = amd_bytealign (w[27], w[28], offset); + w[41] = amd_bytealign (w[26], w[27], offset); + w[40] = amd_bytealign (w[25], w[26], offset); + w[39] = amd_bytealign (w[24], w[25], offset); + w[38] = amd_bytealign (w[23], w[24], offset); + w[37] = amd_bytealign (w[22], w[23], offset); + w[36] = amd_bytealign (w[21], w[22], offset); + w[35] = amd_bytealign (w[20], w[21], offset); + w[34] = amd_bytealign (w[19], w[20], offset); + w[33] = amd_bytealign (w[18], w[19], offset); + w[32] = amd_bytealign (w[17], w[18], offset); + w[31] = amd_bytealign (w[16], w[17], offset); + w[30] = amd_bytealign (w[15], w[16], offset); + w[29] = amd_bytealign (w[14], w[15], offset); + w[28] = amd_bytealign (w[13], w[14], offset); + w[27] = amd_bytealign (w[12], w[13], offset); + w[26] = amd_bytealign (w[11], w[12], offset); + w[25] = amd_bytealign (w[10], w[11], offset); + w[24] = amd_bytealign (w[ 9], w[10], offset); + w[23] = amd_bytealign (w[ 8], w[ 9], offset); + w[22] = amd_bytealign (w[ 7], w[ 8], offset); + w[21] = amd_bytealign (w[ 6], w[ 7], offset); + w[20] = amd_bytealign (w[ 5], w[ 6], offset); + w[19] = amd_bytealign (w[ 4], w[ 5], offset); + w[18] = amd_bytealign (w[ 3], w[ 4], offset); + w[17] = amd_bytealign (w[ 2], w[ 3], offset); + w[16] = amd_bytealign (w[ 1], w[ 2], offset); + w[15] = amd_bytealign (w[ 0], w[ 1], offset); + w[14] = amd_bytealign ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign (w[47], w[48], offset); + w[62] = amd_bytealign (w[46], w[47], offset); + w[61] = amd_bytealign (w[45], w[46], offset); + w[60] = amd_bytealign (w[44], w[45], offset); + w[59] = amd_bytealign (w[43], w[44], offset); + w[58] = amd_bytealign (w[42], w[43], offset); + w[57] = amd_bytealign (w[41], w[42], offset); + w[56] = amd_bytealign (w[40], w[41], offset); + w[55] = amd_bytealign (w[39], w[40], offset); + w[54] = amd_bytealign (w[38], w[39], offset); + w[53] = amd_bytealign (w[37], w[38], offset); + w[52] = amd_bytealign (w[36], w[37], offset); + w[51] = amd_bytealign (w[35], w[36], offset); + w[50] = amd_bytealign (w[34], w[35], offset); + w[49] = amd_bytealign (w[33], w[34], offset); + w[48] = amd_bytealign (w[32], w[33], offset); + w[47] = amd_bytealign (w[31], w[32], offset); + w[46] = amd_bytealign (w[30], w[31], offset); + w[45] = amd_bytealign (w[29], w[30], offset); + w[44] = amd_bytealign (w[28], w[29], offset); + w[43] = amd_bytealign (w[27], w[28], offset); + w[42] = amd_bytealign (w[26], w[27], offset); + w[41] = amd_bytealign (w[25], w[26], offset); + w[40] = amd_bytealign (w[24], w[25], offset); + w[39] = amd_bytealign (w[23], w[24], offset); + w[38] = amd_bytealign (w[22], w[23], offset); + w[37] = amd_bytealign (w[21], w[22], offset); + w[36] = amd_bytealign (w[20], w[21], offset); + w[35] = amd_bytealign (w[19], w[20], offset); + w[34] = amd_bytealign (w[18], w[19], offset); + w[33] = amd_bytealign (w[17], w[18], offset); + w[32] = amd_bytealign (w[16], w[17], offset); + w[31] = amd_bytealign (w[15], w[16], offset); + w[30] = amd_bytealign (w[14], w[15], offset); + w[29] = amd_bytealign (w[13], w[14], offset); + w[28] = amd_bytealign (w[12], w[13], offset); + w[27] = amd_bytealign (w[11], w[12], offset); + w[26] = amd_bytealign (w[10], w[11], offset); + w[25] = amd_bytealign (w[ 9], w[10], offset); + w[24] = amd_bytealign (w[ 8], w[ 9], offset); + w[23] = amd_bytealign (w[ 7], w[ 8], offset); + w[22] = amd_bytealign (w[ 6], w[ 7], offset); + w[21] = amd_bytealign (w[ 5], w[ 6], offset); + w[20] = amd_bytealign (w[ 4], w[ 5], offset); + w[19] = amd_bytealign (w[ 3], w[ 4], offset); + w[18] = amd_bytealign (w[ 2], w[ 3], offset); + w[17] = amd_bytealign (w[ 1], w[ 2], offset); + w[16] = amd_bytealign (w[ 0], w[ 1], offset); + w[15] = amd_bytealign ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign (w[46], w[47], offset); + w[62] = amd_bytealign (w[45], w[46], offset); + w[61] = amd_bytealign (w[44], w[45], offset); + w[60] = amd_bytealign (w[43], w[44], offset); + w[59] = amd_bytealign (w[42], w[43], offset); + w[58] = amd_bytealign (w[41], w[42], offset); + w[57] = amd_bytealign (w[40], w[41], offset); + w[56] = amd_bytealign (w[39], w[40], offset); + w[55] = amd_bytealign (w[38], w[39], offset); + w[54] = amd_bytealign (w[37], w[38], offset); + w[53] = amd_bytealign (w[36], w[37], offset); + w[52] = amd_bytealign (w[35], w[36], offset); + w[51] = amd_bytealign (w[34], w[35], offset); + w[50] = amd_bytealign (w[33], w[34], offset); + w[49] = amd_bytealign (w[32], w[33], offset); + w[48] = amd_bytealign (w[31], w[32], offset); + w[47] = amd_bytealign (w[30], w[31], offset); + w[46] = amd_bytealign (w[29], w[30], offset); + w[45] = amd_bytealign (w[28], w[29], offset); + w[44] = amd_bytealign (w[27], w[28], offset); + w[43] = amd_bytealign (w[26], w[27], offset); + w[42] = amd_bytealign (w[25], w[26], offset); + w[41] = amd_bytealign (w[24], w[25], offset); + w[40] = amd_bytealign (w[23], w[24], offset); + w[39] = amd_bytealign (w[22], w[23], offset); + w[38] = amd_bytealign (w[21], w[22], offset); + w[37] = amd_bytealign (w[20], w[21], offset); + w[36] = amd_bytealign (w[19], w[20], offset); + w[35] = amd_bytealign (w[18], w[19], offset); + w[34] = amd_bytealign (w[17], w[18], offset); + w[33] = amd_bytealign (w[16], w[17], offset); + w[32] = amd_bytealign (w[15], w[16], offset); + w[31] = amd_bytealign (w[14], w[15], offset); + w[30] = amd_bytealign (w[13], w[14], offset); + w[29] = amd_bytealign (w[12], w[13], offset); + w[28] = amd_bytealign (w[11], w[12], offset); + w[27] = amd_bytealign (w[10], w[11], offset); + w[26] = amd_bytealign (w[ 9], w[10], offset); + w[25] = amd_bytealign (w[ 8], w[ 9], offset); + w[24] = amd_bytealign (w[ 7], w[ 8], offset); + w[23] = amd_bytealign (w[ 6], w[ 7], offset); + w[22] = amd_bytealign (w[ 5], w[ 6], offset); + w[21] = amd_bytealign (w[ 4], w[ 5], offset); + w[20] = amd_bytealign (w[ 3], w[ 4], offset); + w[19] = amd_bytealign (w[ 2], w[ 3], offset); + w[18] = amd_bytealign (w[ 1], w[ 2], offset); + w[17] = amd_bytealign (w[ 0], w[ 1], offset); + w[16] = amd_bytealign ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign (w[45], w[46], offset); + w[62] = amd_bytealign (w[44], w[45], offset); + w[61] = amd_bytealign (w[43], w[44], offset); + w[60] = amd_bytealign (w[42], w[43], offset); + w[59] = amd_bytealign (w[41], w[42], offset); + w[58] = amd_bytealign (w[40], w[41], offset); + w[57] = amd_bytealign (w[39], w[40], offset); + w[56] = amd_bytealign (w[38], w[39], offset); + w[55] = amd_bytealign (w[37], w[38], offset); + w[54] = amd_bytealign (w[36], w[37], offset); + w[53] = amd_bytealign (w[35], w[36], offset); + w[52] = amd_bytealign (w[34], w[35], offset); + w[51] = amd_bytealign (w[33], w[34], offset); + w[50] = amd_bytealign (w[32], w[33], offset); + w[49] = amd_bytealign (w[31], w[32], offset); + w[48] = amd_bytealign (w[30], w[31], offset); + w[47] = amd_bytealign (w[29], w[30], offset); + w[46] = amd_bytealign (w[28], w[29], offset); + w[45] = amd_bytealign (w[27], w[28], offset); + w[44] = amd_bytealign (w[26], w[27], offset); + w[43] = amd_bytealign (w[25], w[26], offset); + w[42] = amd_bytealign (w[24], w[25], offset); + w[41] = amd_bytealign (w[23], w[24], offset); + w[40] = amd_bytealign (w[22], w[23], offset); + w[39] = amd_bytealign (w[21], w[22], offset); + w[38] = amd_bytealign (w[20], w[21], offset); + w[37] = amd_bytealign (w[19], w[20], offset); + w[36] = amd_bytealign (w[18], w[19], offset); + w[35] = amd_bytealign (w[17], w[18], offset); + w[34] = amd_bytealign (w[16], w[17], offset); + w[33] = amd_bytealign (w[15], w[16], offset); + w[32] = amd_bytealign (w[14], w[15], offset); + w[31] = amd_bytealign (w[13], w[14], offset); + w[30] = amd_bytealign (w[12], w[13], offset); + w[29] = amd_bytealign (w[11], w[12], offset); + w[28] = amd_bytealign (w[10], w[11], offset); + w[27] = amd_bytealign (w[ 9], w[10], offset); + w[26] = amd_bytealign (w[ 8], w[ 9], offset); + w[25] = amd_bytealign (w[ 7], w[ 8], offset); + w[24] = amd_bytealign (w[ 6], w[ 7], offset); + w[23] = amd_bytealign (w[ 5], w[ 6], offset); + w[22] = amd_bytealign (w[ 4], w[ 5], offset); + w[21] = amd_bytealign (w[ 3], w[ 4], offset); + w[20] = amd_bytealign (w[ 2], w[ 3], offset); + w[19] = amd_bytealign (w[ 1], w[ 2], offset); + w[18] = amd_bytealign (w[ 0], w[ 1], offset); + w[17] = amd_bytealign ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign (w[44], w[45], offset); + w[62] = amd_bytealign (w[43], w[44], offset); + w[61] = amd_bytealign (w[42], w[43], offset); + w[60] = amd_bytealign (w[41], w[42], offset); + w[59] = amd_bytealign (w[40], w[41], offset); + w[58] = amd_bytealign (w[39], w[40], offset); + w[57] = amd_bytealign (w[38], w[39], offset); + w[56] = amd_bytealign (w[37], w[38], offset); + w[55] = amd_bytealign (w[36], w[37], offset); + w[54] = amd_bytealign (w[35], w[36], offset); + w[53] = amd_bytealign (w[34], w[35], offset); + w[52] = amd_bytealign (w[33], w[34], offset); + w[51] = amd_bytealign (w[32], w[33], offset); + w[50] = amd_bytealign (w[31], w[32], offset); + w[49] = amd_bytealign (w[30], w[31], offset); + w[48] = amd_bytealign (w[29], w[30], offset); + w[47] = amd_bytealign (w[28], w[29], offset); + w[46] = amd_bytealign (w[27], w[28], offset); + w[45] = amd_bytealign (w[26], w[27], offset); + w[44] = amd_bytealign (w[25], w[26], offset); + w[43] = amd_bytealign (w[24], w[25], offset); + w[42] = amd_bytealign (w[23], w[24], offset); + w[41] = amd_bytealign (w[22], w[23], offset); + w[40] = amd_bytealign (w[21], w[22], offset); + w[39] = amd_bytealign (w[20], w[21], offset); + w[38] = amd_bytealign (w[19], w[20], offset); + w[37] = amd_bytealign (w[18], w[19], offset); + w[36] = amd_bytealign (w[17], w[18], offset); + w[35] = amd_bytealign (w[16], w[17], offset); + w[34] = amd_bytealign (w[15], w[16], offset); + w[33] = amd_bytealign (w[14], w[15], offset); + w[32] = amd_bytealign (w[13], w[14], offset); + w[31] = amd_bytealign (w[12], w[13], offset); + w[30] = amd_bytealign (w[11], w[12], offset); + w[29] = amd_bytealign (w[10], w[11], offset); + w[28] = amd_bytealign (w[ 9], w[10], offset); + w[27] = amd_bytealign (w[ 8], w[ 9], offset); + w[26] = amd_bytealign (w[ 7], w[ 8], offset); + w[25] = amd_bytealign (w[ 6], w[ 7], offset); + w[24] = amd_bytealign (w[ 5], w[ 6], offset); + w[23] = amd_bytealign (w[ 4], w[ 5], offset); + w[22] = amd_bytealign (w[ 3], w[ 4], offset); + w[21] = amd_bytealign (w[ 2], w[ 3], offset); + w[20] = amd_bytealign (w[ 1], w[ 2], offset); + w[19] = amd_bytealign (w[ 0], w[ 1], offset); + w[18] = amd_bytealign ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign (w[43], w[44], offset); + w[62] = amd_bytealign (w[42], w[43], offset); + w[61] = amd_bytealign (w[41], w[42], offset); + w[60] = amd_bytealign (w[40], w[41], offset); + w[59] = amd_bytealign (w[39], w[40], offset); + w[58] = amd_bytealign (w[38], w[39], offset); + w[57] = amd_bytealign (w[37], w[38], offset); + w[56] = amd_bytealign (w[36], w[37], offset); + w[55] = amd_bytealign (w[35], w[36], offset); + w[54] = amd_bytealign (w[34], w[35], offset); + w[53] = amd_bytealign (w[33], w[34], offset); + w[52] = amd_bytealign (w[32], w[33], offset); + w[51] = amd_bytealign (w[31], w[32], offset); + w[50] = amd_bytealign (w[30], w[31], offset); + w[49] = amd_bytealign (w[29], w[30], offset); + w[48] = amd_bytealign (w[28], w[29], offset); + w[47] = amd_bytealign (w[27], w[28], offset); + w[46] = amd_bytealign (w[26], w[27], offset); + w[45] = amd_bytealign (w[25], w[26], offset); + w[44] = amd_bytealign (w[24], w[25], offset); + w[43] = amd_bytealign (w[23], w[24], offset); + w[42] = amd_bytealign (w[22], w[23], offset); + w[41] = amd_bytealign (w[21], w[22], offset); + w[40] = amd_bytealign (w[20], w[21], offset); + w[39] = amd_bytealign (w[19], w[20], offset); + w[38] = amd_bytealign (w[18], w[19], offset); + w[37] = amd_bytealign (w[17], w[18], offset); + w[36] = amd_bytealign (w[16], w[17], offset); + w[35] = amd_bytealign (w[15], w[16], offset); + w[34] = amd_bytealign (w[14], w[15], offset); + w[33] = amd_bytealign (w[13], w[14], offset); + w[32] = amd_bytealign (w[12], w[13], offset); + w[31] = amd_bytealign (w[11], w[12], offset); + w[30] = amd_bytealign (w[10], w[11], offset); + w[29] = amd_bytealign (w[ 9], w[10], offset); + w[28] = amd_bytealign (w[ 8], w[ 9], offset); + w[27] = amd_bytealign (w[ 7], w[ 8], offset); + w[26] = amd_bytealign (w[ 6], w[ 7], offset); + w[25] = amd_bytealign (w[ 5], w[ 6], offset); + w[24] = amd_bytealign (w[ 4], w[ 5], offset); + w[23] = amd_bytealign (w[ 3], w[ 4], offset); + w[22] = amd_bytealign (w[ 2], w[ 3], offset); + w[21] = amd_bytealign (w[ 1], w[ 2], offset); + w[20] = amd_bytealign (w[ 0], w[ 1], offset); + w[19] = amd_bytealign ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = amd_bytealign (w[42], w[43], offset); + w[62] = amd_bytealign (w[41], w[42], offset); + w[61] = amd_bytealign (w[40], w[41], offset); + w[60] = amd_bytealign (w[39], w[40], offset); + w[59] = amd_bytealign (w[38], w[39], offset); + w[58] = amd_bytealign (w[37], w[38], offset); + w[57] = amd_bytealign (w[36], w[37], offset); + w[56] = amd_bytealign (w[35], w[36], offset); + w[55] = amd_bytealign (w[34], w[35], offset); + w[54] = amd_bytealign (w[33], w[34], offset); + w[53] = amd_bytealign (w[32], w[33], offset); + w[52] = amd_bytealign (w[31], w[32], offset); + w[51] = amd_bytealign (w[30], w[31], offset); + w[50] = amd_bytealign (w[29], w[30], offset); + w[49] = amd_bytealign (w[28], w[29], offset); + w[48] = amd_bytealign (w[27], w[28], offset); + w[47] = amd_bytealign (w[26], w[27], offset); + w[46] = amd_bytealign (w[25], w[26], offset); + w[45] = amd_bytealign (w[24], w[25], offset); + w[44] = amd_bytealign (w[23], w[24], offset); + w[43] = amd_bytealign (w[22], w[23], offset); + w[42] = amd_bytealign (w[21], w[22], offset); + w[41] = amd_bytealign (w[20], w[21], offset); + w[40] = amd_bytealign (w[19], w[20], offset); + w[39] = amd_bytealign (w[18], w[19], offset); + w[38] = amd_bytealign (w[17], w[18], offset); + w[37] = amd_bytealign (w[16], w[17], offset); + w[36] = amd_bytealign (w[15], w[16], offset); + w[35] = amd_bytealign (w[14], w[15], offset); + w[34] = amd_bytealign (w[13], w[14], offset); + w[33] = amd_bytealign (w[12], w[13], offset); + w[32] = amd_bytealign (w[11], w[12], offset); + w[31] = amd_bytealign (w[10], w[11], offset); + w[30] = amd_bytealign (w[ 9], w[10], offset); + w[29] = amd_bytealign (w[ 8], w[ 9], offset); + w[28] = amd_bytealign (w[ 7], w[ 8], offset); + w[27] = amd_bytealign (w[ 6], w[ 7], offset); + w[26] = amd_bytealign (w[ 5], w[ 6], offset); + w[25] = amd_bytealign (w[ 4], w[ 5], offset); + w[24] = amd_bytealign (w[ 3], w[ 4], offset); + w[23] = amd_bytealign (w[ 2], w[ 3], offset); + w[22] = amd_bytealign (w[ 1], w[ 2], offset); + w[21] = amd_bytealign (w[ 0], w[ 1], offset); + w[20] = amd_bytealign ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign (w[41], w[42], offset); + w[62] = amd_bytealign (w[40], w[41], offset); + w[61] = amd_bytealign (w[39], w[40], offset); + w[60] = amd_bytealign (w[38], w[39], offset); + w[59] = amd_bytealign (w[37], w[38], offset); + w[58] = amd_bytealign (w[36], w[37], offset); + w[57] = amd_bytealign (w[35], w[36], offset); + w[56] = amd_bytealign (w[34], w[35], offset); + w[55] = amd_bytealign (w[33], w[34], offset); + w[54] = amd_bytealign (w[32], w[33], offset); + w[53] = amd_bytealign (w[31], w[32], offset); + w[52] = amd_bytealign (w[30], w[31], offset); + w[51] = amd_bytealign (w[29], w[30], offset); + w[50] = amd_bytealign (w[28], w[29], offset); + w[49] = amd_bytealign (w[27], w[28], offset); + w[48] = amd_bytealign (w[26], w[27], offset); + w[47] = amd_bytealign (w[25], w[26], offset); + w[46] = amd_bytealign (w[24], w[25], offset); + w[45] = amd_bytealign (w[23], w[24], offset); + w[44] = amd_bytealign (w[22], w[23], offset); + w[43] = amd_bytealign (w[21], w[22], offset); + w[42] = amd_bytealign (w[20], w[21], offset); + w[41] = amd_bytealign (w[19], w[20], offset); + w[40] = amd_bytealign (w[18], w[19], offset); + w[39] = amd_bytealign (w[17], w[18], offset); + w[38] = amd_bytealign (w[16], w[17], offset); + w[37] = amd_bytealign (w[15], w[16], offset); + w[36] = amd_bytealign (w[14], w[15], offset); + w[35] = amd_bytealign (w[13], w[14], offset); + w[34] = amd_bytealign (w[12], w[13], offset); + w[33] = amd_bytealign (w[11], w[12], offset); + w[32] = amd_bytealign (w[10], w[11], offset); + w[31] = amd_bytealign (w[ 9], w[10], offset); + w[30] = amd_bytealign (w[ 8], w[ 9], offset); + w[29] = amd_bytealign (w[ 7], w[ 8], offset); + w[28] = amd_bytealign (w[ 6], w[ 7], offset); + w[27] = amd_bytealign (w[ 5], w[ 6], offset); + w[26] = amd_bytealign (w[ 4], w[ 5], offset); + w[25] = amd_bytealign (w[ 3], w[ 4], offset); + w[24] = amd_bytealign (w[ 2], w[ 3], offset); + w[23] = amd_bytealign (w[ 1], w[ 2], offset); + w[22] = amd_bytealign (w[ 0], w[ 1], offset); + w[21] = amd_bytealign ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = amd_bytealign (w[40], w[41], offset); + w[62] = amd_bytealign (w[39], w[40], offset); + w[61] = amd_bytealign (w[38], w[39], offset); + w[60] = amd_bytealign (w[37], w[38], offset); + w[59] = amd_bytealign (w[36], w[37], offset); + w[58] = amd_bytealign (w[35], w[36], offset); + w[57] = amd_bytealign (w[34], w[35], offset); + w[56] = amd_bytealign (w[33], w[34], offset); + w[55] = amd_bytealign (w[32], w[33], offset); + w[54] = amd_bytealign (w[31], w[32], offset); + w[53] = amd_bytealign (w[30], w[31], offset); + w[52] = amd_bytealign (w[29], w[30], offset); + w[51] = amd_bytealign (w[28], w[29], offset); + w[50] = amd_bytealign (w[27], w[28], offset); + w[49] = amd_bytealign (w[26], w[27], offset); + w[48] = amd_bytealign (w[25], w[26], offset); + w[47] = amd_bytealign (w[24], w[25], offset); + w[46] = amd_bytealign (w[23], w[24], offset); + w[45] = amd_bytealign (w[22], w[23], offset); + w[44] = amd_bytealign (w[21], w[22], offset); + w[43] = amd_bytealign (w[20], w[21], offset); + w[42] = amd_bytealign (w[19], w[20], offset); + w[41] = amd_bytealign (w[18], w[19], offset); + w[40] = amd_bytealign (w[17], w[18], offset); + w[39] = amd_bytealign (w[16], w[17], offset); + w[38] = amd_bytealign (w[15], w[16], offset); + w[37] = amd_bytealign (w[14], w[15], offset); + w[36] = amd_bytealign (w[13], w[14], offset); + w[35] = amd_bytealign (w[12], w[13], offset); + w[34] = amd_bytealign (w[11], w[12], offset); + w[33] = amd_bytealign (w[10], w[11], offset); + w[32] = amd_bytealign (w[ 9], w[10], offset); + w[31] = amd_bytealign (w[ 8], w[ 9], offset); + w[30] = amd_bytealign (w[ 7], w[ 8], offset); + w[29] = amd_bytealign (w[ 6], w[ 7], offset); + w[28] = amd_bytealign (w[ 5], w[ 6], offset); + w[27] = amd_bytealign (w[ 4], w[ 5], offset); + w[26] = amd_bytealign (w[ 3], w[ 4], offset); + w[25] = amd_bytealign (w[ 2], w[ 3], offset); + w[24] = amd_bytealign (w[ 1], w[ 2], offset); + w[23] = amd_bytealign (w[ 0], w[ 1], offset); + w[22] = amd_bytealign ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign (w[39], w[40], offset); + w[62] = amd_bytealign (w[38], w[39], offset); + w[61] = amd_bytealign (w[37], w[38], offset); + w[60] = amd_bytealign (w[36], w[37], offset); + w[59] = amd_bytealign (w[35], w[36], offset); + w[58] = amd_bytealign (w[34], w[35], offset); + w[57] = amd_bytealign (w[33], w[34], offset); + w[56] = amd_bytealign (w[32], w[33], offset); + w[55] = amd_bytealign (w[31], w[32], offset); + w[54] = amd_bytealign (w[30], w[31], offset); + w[53] = amd_bytealign (w[29], w[30], offset); + w[52] = amd_bytealign (w[28], w[29], offset); + w[51] = amd_bytealign (w[27], w[28], offset); + w[50] = amd_bytealign (w[26], w[27], offset); + w[49] = amd_bytealign (w[25], w[26], offset); + w[48] = amd_bytealign (w[24], w[25], offset); + w[47] = amd_bytealign (w[23], w[24], offset); + w[46] = amd_bytealign (w[22], w[23], offset); + w[45] = amd_bytealign (w[21], w[22], offset); + w[44] = amd_bytealign (w[20], w[21], offset); + w[43] = amd_bytealign (w[19], w[20], offset); + w[42] = amd_bytealign (w[18], w[19], offset); + w[41] = amd_bytealign (w[17], w[18], offset); + w[40] = amd_bytealign (w[16], w[17], offset); + w[39] = amd_bytealign (w[15], w[16], offset); + w[38] = amd_bytealign (w[14], w[15], offset); + w[37] = amd_bytealign (w[13], w[14], offset); + w[36] = amd_bytealign (w[12], w[13], offset); + w[35] = amd_bytealign (w[11], w[12], offset); + w[34] = amd_bytealign (w[10], w[11], offset); + w[33] = amd_bytealign (w[ 9], w[10], offset); + w[32] = amd_bytealign (w[ 8], w[ 9], offset); + w[31] = amd_bytealign (w[ 7], w[ 8], offset); + w[30] = amd_bytealign (w[ 6], w[ 7], offset); + w[29] = amd_bytealign (w[ 5], w[ 6], offset); + w[28] = amd_bytealign (w[ 4], w[ 5], offset); + w[27] = amd_bytealign (w[ 3], w[ 4], offset); + w[26] = amd_bytealign (w[ 2], w[ 3], offset); + w[25] = amd_bytealign (w[ 1], w[ 2], offset); + w[24] = amd_bytealign (w[ 0], w[ 1], offset); + w[23] = amd_bytealign ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign (w[38], w[39], offset); + w[62] = amd_bytealign (w[37], w[38], offset); + w[61] = amd_bytealign (w[36], w[37], offset); + w[60] = amd_bytealign (w[35], w[36], offset); + w[59] = amd_bytealign (w[34], w[35], offset); + w[58] = amd_bytealign (w[33], w[34], offset); + w[57] = amd_bytealign (w[32], w[33], offset); + w[56] = amd_bytealign (w[31], w[32], offset); + w[55] = amd_bytealign (w[30], w[31], offset); + w[54] = amd_bytealign (w[29], w[30], offset); + w[53] = amd_bytealign (w[28], w[29], offset); + w[52] = amd_bytealign (w[27], w[28], offset); + w[51] = amd_bytealign (w[26], w[27], offset); + w[50] = amd_bytealign (w[25], w[26], offset); + w[49] = amd_bytealign (w[24], w[25], offset); + w[48] = amd_bytealign (w[23], w[24], offset); + w[47] = amd_bytealign (w[22], w[23], offset); + w[46] = amd_bytealign (w[21], w[22], offset); + w[45] = amd_bytealign (w[20], w[21], offset); + w[44] = amd_bytealign (w[19], w[20], offset); + w[43] = amd_bytealign (w[18], w[19], offset); + w[42] = amd_bytealign (w[17], w[18], offset); + w[41] = amd_bytealign (w[16], w[17], offset); + w[40] = amd_bytealign (w[15], w[16], offset); + w[39] = amd_bytealign (w[14], w[15], offset); + w[38] = amd_bytealign (w[13], w[14], offset); + w[37] = amd_bytealign (w[12], w[13], offset); + w[36] = amd_bytealign (w[11], w[12], offset); + w[35] = amd_bytealign (w[10], w[11], offset); + w[34] = amd_bytealign (w[ 9], w[10], offset); + w[33] = amd_bytealign (w[ 8], w[ 9], offset); + w[32] = amd_bytealign (w[ 7], w[ 8], offset); + w[31] = amd_bytealign (w[ 6], w[ 7], offset); + w[30] = amd_bytealign (w[ 5], w[ 6], offset); + w[29] = amd_bytealign (w[ 4], w[ 5], offset); + w[28] = amd_bytealign (w[ 3], w[ 4], offset); + w[27] = amd_bytealign (w[ 2], w[ 3], offset); + w[26] = amd_bytealign (w[ 1], w[ 2], offset); + w[25] = amd_bytealign (w[ 0], w[ 1], offset); + w[24] = amd_bytealign ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign (w[37], w[38], offset); + w[62] = amd_bytealign (w[36], w[37], offset); + w[61] = amd_bytealign (w[35], w[36], offset); + w[60] = amd_bytealign (w[34], w[35], offset); + w[59] = amd_bytealign (w[33], w[34], offset); + w[58] = amd_bytealign (w[32], w[33], offset); + w[57] = amd_bytealign (w[31], w[32], offset); + w[56] = amd_bytealign (w[30], w[31], offset); + w[55] = amd_bytealign (w[29], w[30], offset); + w[54] = amd_bytealign (w[28], w[29], offset); + w[53] = amd_bytealign (w[27], w[28], offset); + w[52] = amd_bytealign (w[26], w[27], offset); + w[51] = amd_bytealign (w[25], w[26], offset); + w[50] = amd_bytealign (w[24], w[25], offset); + w[49] = amd_bytealign (w[23], w[24], offset); + w[48] = amd_bytealign (w[22], w[23], offset); + w[47] = amd_bytealign (w[21], w[22], offset); + w[46] = amd_bytealign (w[20], w[21], offset); + w[45] = amd_bytealign (w[19], w[20], offset); + w[44] = amd_bytealign (w[18], w[19], offset); + w[43] = amd_bytealign (w[17], w[18], offset); + w[42] = amd_bytealign (w[16], w[17], offset); + w[41] = amd_bytealign (w[15], w[16], offset); + w[40] = amd_bytealign (w[14], w[15], offset); + w[39] = amd_bytealign (w[13], w[14], offset); + w[38] = amd_bytealign (w[12], w[13], offset); + w[37] = amd_bytealign (w[11], w[12], offset); + w[36] = amd_bytealign (w[10], w[11], offset); + w[35] = amd_bytealign (w[ 9], w[10], offset); + w[34] = amd_bytealign (w[ 8], w[ 9], offset); + w[33] = amd_bytealign (w[ 7], w[ 8], offset); + w[32] = amd_bytealign (w[ 6], w[ 7], offset); + w[31] = amd_bytealign (w[ 5], w[ 6], offset); + w[30] = amd_bytealign (w[ 4], w[ 5], offset); + w[29] = amd_bytealign (w[ 3], w[ 4], offset); + w[28] = amd_bytealign (w[ 2], w[ 3], offset); + w[27] = amd_bytealign (w[ 1], w[ 2], offset); + w[26] = amd_bytealign (w[ 0], w[ 1], offset); + w[25] = amd_bytealign ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign (w[36], w[37], offset); + w[62] = amd_bytealign (w[35], w[36], offset); + w[61] = amd_bytealign (w[34], w[35], offset); + w[60] = amd_bytealign (w[33], w[34], offset); + w[59] = amd_bytealign (w[32], w[33], offset); + w[58] = amd_bytealign (w[31], w[32], offset); + w[57] = amd_bytealign (w[30], w[31], offset); + w[56] = amd_bytealign (w[29], w[30], offset); + w[55] = amd_bytealign (w[28], w[29], offset); + w[54] = amd_bytealign (w[27], w[28], offset); + w[53] = amd_bytealign (w[26], w[27], offset); + w[52] = amd_bytealign (w[25], w[26], offset); + w[51] = amd_bytealign (w[24], w[25], offset); + w[50] = amd_bytealign (w[23], w[24], offset); + w[49] = amd_bytealign (w[22], w[23], offset); + w[48] = amd_bytealign (w[21], w[22], offset); + w[47] = amd_bytealign (w[20], w[21], offset); + w[46] = amd_bytealign (w[19], w[20], offset); + w[45] = amd_bytealign (w[18], w[19], offset); + w[44] = amd_bytealign (w[17], w[18], offset); + w[43] = amd_bytealign (w[16], w[17], offset); + w[42] = amd_bytealign (w[15], w[16], offset); + w[41] = amd_bytealign (w[14], w[15], offset); + w[40] = amd_bytealign (w[13], w[14], offset); + w[39] = amd_bytealign (w[12], w[13], offset); + w[38] = amd_bytealign (w[11], w[12], offset); + w[37] = amd_bytealign (w[10], w[11], offset); + w[36] = amd_bytealign (w[ 9], w[10], offset); + w[35] = amd_bytealign (w[ 8], w[ 9], offset); + w[34] = amd_bytealign (w[ 7], w[ 8], offset); + w[33] = amd_bytealign (w[ 6], w[ 7], offset); + w[32] = amd_bytealign (w[ 5], w[ 6], offset); + w[31] = amd_bytealign (w[ 4], w[ 5], offset); + w[30] = amd_bytealign (w[ 3], w[ 4], offset); + w[29] = amd_bytealign (w[ 2], w[ 3], offset); + w[28] = amd_bytealign (w[ 1], w[ 2], offset); + w[27] = amd_bytealign (w[ 0], w[ 1], offset); + w[26] = amd_bytealign ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign (w[35], w[36], offset); + w[62] = amd_bytealign (w[34], w[35], offset); + w[61] = amd_bytealign (w[33], w[34], offset); + w[60] = amd_bytealign (w[32], w[33], offset); + w[59] = amd_bytealign (w[31], w[32], offset); + w[58] = amd_bytealign (w[30], w[31], offset); + w[57] = amd_bytealign (w[29], w[30], offset); + w[56] = amd_bytealign (w[28], w[29], offset); + w[55] = amd_bytealign (w[27], w[28], offset); + w[54] = amd_bytealign (w[26], w[27], offset); + w[53] = amd_bytealign (w[25], w[26], offset); + w[52] = amd_bytealign (w[24], w[25], offset); + w[51] = amd_bytealign (w[23], w[24], offset); + w[50] = amd_bytealign (w[22], w[23], offset); + w[49] = amd_bytealign (w[21], w[22], offset); + w[48] = amd_bytealign (w[20], w[21], offset); + w[47] = amd_bytealign (w[19], w[20], offset); + w[46] = amd_bytealign (w[18], w[19], offset); + w[45] = amd_bytealign (w[17], w[18], offset); + w[44] = amd_bytealign (w[16], w[17], offset); + w[43] = amd_bytealign (w[15], w[16], offset); + w[42] = amd_bytealign (w[14], w[15], offset); + w[41] = amd_bytealign (w[13], w[14], offset); + w[40] = amd_bytealign (w[12], w[13], offset); + w[39] = amd_bytealign (w[11], w[12], offset); + w[38] = amd_bytealign (w[10], w[11], offset); + w[37] = amd_bytealign (w[ 9], w[10], offset); + w[36] = amd_bytealign (w[ 8], w[ 9], offset); + w[35] = amd_bytealign (w[ 7], w[ 8], offset); + w[34] = amd_bytealign (w[ 6], w[ 7], offset); + w[33] = amd_bytealign (w[ 5], w[ 6], offset); + w[32] = amd_bytealign (w[ 4], w[ 5], offset); + w[31] = amd_bytealign (w[ 3], w[ 4], offset); + w[30] = amd_bytealign (w[ 2], w[ 3], offset); + w[29] = amd_bytealign (w[ 1], w[ 2], offset); + w[28] = amd_bytealign (w[ 0], w[ 1], offset); + w[27] = amd_bytealign ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign (w[34], w[35], offset); + w[62] = amd_bytealign (w[33], w[34], offset); + w[61] = amd_bytealign (w[32], w[33], offset); + w[60] = amd_bytealign (w[31], w[32], offset); + w[59] = amd_bytealign (w[30], w[31], offset); + w[58] = amd_bytealign (w[29], w[30], offset); + w[57] = amd_bytealign (w[28], w[29], offset); + w[56] = amd_bytealign (w[27], w[28], offset); + w[55] = amd_bytealign (w[26], w[27], offset); + w[54] = amd_bytealign (w[25], w[26], offset); + w[53] = amd_bytealign (w[24], w[25], offset); + w[52] = amd_bytealign (w[23], w[24], offset); + w[51] = amd_bytealign (w[22], w[23], offset); + w[50] = amd_bytealign (w[21], w[22], offset); + w[49] = amd_bytealign (w[20], w[21], offset); + w[48] = amd_bytealign (w[19], w[20], offset); + w[47] = amd_bytealign (w[18], w[19], offset); + w[46] = amd_bytealign (w[17], w[18], offset); + w[45] = amd_bytealign (w[16], w[17], offset); + w[44] = amd_bytealign (w[15], w[16], offset); + w[43] = amd_bytealign (w[14], w[15], offset); + w[42] = amd_bytealign (w[13], w[14], offset); + w[41] = amd_bytealign (w[12], w[13], offset); + w[40] = amd_bytealign (w[11], w[12], offset); + w[39] = amd_bytealign (w[10], w[11], offset); + w[38] = amd_bytealign (w[ 9], w[10], offset); + w[37] = amd_bytealign (w[ 8], w[ 9], offset); + w[36] = amd_bytealign (w[ 7], w[ 8], offset); + w[35] = amd_bytealign (w[ 6], w[ 7], offset); + w[34] = amd_bytealign (w[ 5], w[ 6], offset); + w[33] = amd_bytealign (w[ 4], w[ 5], offset); + w[32] = amd_bytealign (w[ 3], w[ 4], offset); + w[31] = amd_bytealign (w[ 2], w[ 3], offset); + w[30] = amd_bytealign (w[ 1], w[ 2], offset); + w[29] = amd_bytealign (w[ 0], w[ 1], offset); + w[28] = amd_bytealign ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = amd_bytealign (w[33], w[34], offset); + w[62] = amd_bytealign (w[32], w[33], offset); + w[61] = amd_bytealign (w[31], w[32], offset); + w[60] = amd_bytealign (w[30], w[31], offset); + w[59] = amd_bytealign (w[29], w[30], offset); + w[58] = amd_bytealign (w[28], w[29], offset); + w[57] = amd_bytealign (w[27], w[28], offset); + w[56] = amd_bytealign (w[26], w[27], offset); + w[55] = amd_bytealign (w[25], w[26], offset); + w[54] = amd_bytealign (w[24], w[25], offset); + w[53] = amd_bytealign (w[23], w[24], offset); + w[52] = amd_bytealign (w[22], w[23], offset); + w[51] = amd_bytealign (w[21], w[22], offset); + w[50] = amd_bytealign (w[20], w[21], offset); + w[49] = amd_bytealign (w[19], w[20], offset); + w[48] = amd_bytealign (w[18], w[19], offset); + w[47] = amd_bytealign (w[17], w[18], offset); + w[46] = amd_bytealign (w[16], w[17], offset); + w[45] = amd_bytealign (w[15], w[16], offset); + w[44] = amd_bytealign (w[14], w[15], offset); + w[43] = amd_bytealign (w[13], w[14], offset); + w[42] = amd_bytealign (w[12], w[13], offset); + w[41] = amd_bytealign (w[11], w[12], offset); + w[40] = amd_bytealign (w[10], w[11], offset); + w[39] = amd_bytealign (w[ 9], w[10], offset); + w[38] = amd_bytealign (w[ 8], w[ 9], offset); + w[37] = amd_bytealign (w[ 7], w[ 8], offset); + w[36] = amd_bytealign (w[ 6], w[ 7], offset); + w[35] = amd_bytealign (w[ 5], w[ 6], offset); + w[34] = amd_bytealign (w[ 4], w[ 5], offset); + w[33] = amd_bytealign (w[ 3], w[ 4], offset); + w[32] = amd_bytealign (w[ 2], w[ 3], offset); + w[31] = amd_bytealign (w[ 1], w[ 2], offset); + w[30] = amd_bytealign (w[ 0], w[ 1], offset); + w[29] = amd_bytealign ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign (w[32], w[33], offset); + w[62] = amd_bytealign (w[31], w[32], offset); + w[61] = amd_bytealign (w[30], w[31], offset); + w[60] = amd_bytealign (w[29], w[30], offset); + w[59] = amd_bytealign (w[28], w[29], offset); + w[58] = amd_bytealign (w[27], w[28], offset); + w[57] = amd_bytealign (w[26], w[27], offset); + w[56] = amd_bytealign (w[25], w[26], offset); + w[55] = amd_bytealign (w[24], w[25], offset); + w[54] = amd_bytealign (w[23], w[24], offset); + w[53] = amd_bytealign (w[22], w[23], offset); + w[52] = amd_bytealign (w[21], w[22], offset); + w[51] = amd_bytealign (w[20], w[21], offset); + w[50] = amd_bytealign (w[19], w[20], offset); + w[49] = amd_bytealign (w[18], w[19], offset); + w[48] = amd_bytealign (w[17], w[18], offset); + w[47] = amd_bytealign (w[16], w[17], offset); + w[46] = amd_bytealign (w[15], w[16], offset); + w[45] = amd_bytealign (w[14], w[15], offset); + w[44] = amd_bytealign (w[13], w[14], offset); + w[43] = amd_bytealign (w[12], w[13], offset); + w[42] = amd_bytealign (w[11], w[12], offset); + w[41] = amd_bytealign (w[10], w[11], offset); + w[40] = amd_bytealign (w[ 9], w[10], offset); + w[39] = amd_bytealign (w[ 8], w[ 9], offset); + w[38] = amd_bytealign (w[ 7], w[ 8], offset); + w[37] = amd_bytealign (w[ 6], w[ 7], offset); + w[36] = amd_bytealign (w[ 5], w[ 6], offset); + w[35] = amd_bytealign (w[ 4], w[ 5], offset); + w[34] = amd_bytealign (w[ 3], w[ 4], offset); + w[33] = amd_bytealign (w[ 2], w[ 3], offset); + w[32] = amd_bytealign (w[ 1], w[ 2], offset); + w[31] = amd_bytealign (w[ 0], w[ 1], offset); + w[30] = amd_bytealign ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign (w[31], w[32], offset); + w[62] = amd_bytealign (w[30], w[31], offset); + w[61] = amd_bytealign (w[29], w[30], offset); + w[60] = amd_bytealign (w[28], w[29], offset); + w[59] = amd_bytealign (w[27], w[28], offset); + w[58] = amd_bytealign (w[26], w[27], offset); + w[57] = amd_bytealign (w[25], w[26], offset); + w[56] = amd_bytealign (w[24], w[25], offset); + w[55] = amd_bytealign (w[23], w[24], offset); + w[54] = amd_bytealign (w[22], w[23], offset); + w[53] = amd_bytealign (w[21], w[22], offset); + w[52] = amd_bytealign (w[20], w[21], offset); + w[51] = amd_bytealign (w[19], w[20], offset); + w[50] = amd_bytealign (w[18], w[19], offset); + w[49] = amd_bytealign (w[17], w[18], offset); + w[48] = amd_bytealign (w[16], w[17], offset); + w[47] = amd_bytealign (w[15], w[16], offset); + w[46] = amd_bytealign (w[14], w[15], offset); + w[45] = amd_bytealign (w[13], w[14], offset); + w[44] = amd_bytealign (w[12], w[13], offset); + w[43] = amd_bytealign (w[11], w[12], offset); + w[42] = amd_bytealign (w[10], w[11], offset); + w[41] = amd_bytealign (w[ 9], w[10], offset); + w[40] = amd_bytealign (w[ 8], w[ 9], offset); + w[39] = amd_bytealign (w[ 7], w[ 8], offset); + w[38] = amd_bytealign (w[ 6], w[ 7], offset); + w[37] = amd_bytealign (w[ 5], w[ 6], offset); + w[36] = amd_bytealign (w[ 4], w[ 5], offset); + w[35] = amd_bytealign (w[ 3], w[ 4], offset); + w[34] = amd_bytealign (w[ 2], w[ 3], offset); + w[33] = amd_bytealign (w[ 1], w[ 2], offset); + w[32] = amd_bytealign (w[ 0], w[ 1], offset); + w[31] = amd_bytealign ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign (w[30], w[31], offset); + w[62] = amd_bytealign (w[29], w[30], offset); + w[61] = amd_bytealign (w[28], w[29], offset); + w[60] = amd_bytealign (w[27], w[28], offset); + w[59] = amd_bytealign (w[26], w[27], offset); + w[58] = amd_bytealign (w[25], w[26], offset); + w[57] = amd_bytealign (w[24], w[25], offset); + w[56] = amd_bytealign (w[23], w[24], offset); + w[55] = amd_bytealign (w[22], w[23], offset); + w[54] = amd_bytealign (w[21], w[22], offset); + w[53] = amd_bytealign (w[20], w[21], offset); + w[52] = amd_bytealign (w[19], w[20], offset); + w[51] = amd_bytealign (w[18], w[19], offset); + w[50] = amd_bytealign (w[17], w[18], offset); + w[49] = amd_bytealign (w[16], w[17], offset); + w[48] = amd_bytealign (w[15], w[16], offset); + w[47] = amd_bytealign (w[14], w[15], offset); + w[46] = amd_bytealign (w[13], w[14], offset); + w[45] = amd_bytealign (w[12], w[13], offset); + w[44] = amd_bytealign (w[11], w[12], offset); + w[43] = amd_bytealign (w[10], w[11], offset); + w[42] = amd_bytealign (w[ 9], w[10], offset); + w[41] = amd_bytealign (w[ 8], w[ 9], offset); + w[40] = amd_bytealign (w[ 7], w[ 8], offset); + w[39] = amd_bytealign (w[ 6], w[ 7], offset); + w[38] = amd_bytealign (w[ 5], w[ 6], offset); + w[37] = amd_bytealign (w[ 4], w[ 5], offset); + w[36] = amd_bytealign (w[ 3], w[ 4], offset); + w[35] = amd_bytealign (w[ 2], w[ 3], offset); + w[34] = amd_bytealign (w[ 1], w[ 2], offset); + w[33] = amd_bytealign (w[ 0], w[ 1], offset); + w[32] = amd_bytealign ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = amd_bytealign (w[29], w[30], offset); + w[62] = amd_bytealign (w[28], w[29], offset); + w[61] = amd_bytealign (w[27], w[28], offset); + w[60] = amd_bytealign (w[26], w[27], offset); + w[59] = amd_bytealign (w[25], w[26], offset); + w[58] = amd_bytealign (w[24], w[25], offset); + w[57] = amd_bytealign (w[23], w[24], offset); + w[56] = amd_bytealign (w[22], w[23], offset); + w[55] = amd_bytealign (w[21], w[22], offset); + w[54] = amd_bytealign (w[20], w[21], offset); + w[53] = amd_bytealign (w[19], w[20], offset); + w[52] = amd_bytealign (w[18], w[19], offset); + w[51] = amd_bytealign (w[17], w[18], offset); + w[50] = amd_bytealign (w[16], w[17], offset); + w[49] = amd_bytealign (w[15], w[16], offset); + w[48] = amd_bytealign (w[14], w[15], offset); + w[47] = amd_bytealign (w[13], w[14], offset); + w[46] = amd_bytealign (w[12], w[13], offset); + w[45] = amd_bytealign (w[11], w[12], offset); + w[44] = amd_bytealign (w[10], w[11], offset); + w[43] = amd_bytealign (w[ 9], w[10], offset); + w[42] = amd_bytealign (w[ 8], w[ 9], offset); + w[41] = amd_bytealign (w[ 7], w[ 8], offset); + w[40] = amd_bytealign (w[ 6], w[ 7], offset); + w[39] = amd_bytealign (w[ 5], w[ 6], offset); + w[38] = amd_bytealign (w[ 4], w[ 5], offset); + w[37] = amd_bytealign (w[ 3], w[ 4], offset); + w[36] = amd_bytealign (w[ 2], w[ 3], offset); + w[35] = amd_bytealign (w[ 1], w[ 2], offset); + w[34] = amd_bytealign (w[ 0], w[ 1], offset); + w[33] = amd_bytealign ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = amd_bytealign (w[28], w[29], offset); + w[62] = amd_bytealign (w[27], w[28], offset); + w[61] = amd_bytealign (w[26], w[27], offset); + w[60] = amd_bytealign (w[25], w[26], offset); + w[59] = amd_bytealign (w[24], w[25], offset); + w[58] = amd_bytealign (w[23], w[24], offset); + w[57] = amd_bytealign (w[22], w[23], offset); + w[56] = amd_bytealign (w[21], w[22], offset); + w[55] = amd_bytealign (w[20], w[21], offset); + w[54] = amd_bytealign (w[19], w[20], offset); + w[53] = amd_bytealign (w[18], w[19], offset); + w[52] = amd_bytealign (w[17], w[18], offset); + w[51] = amd_bytealign (w[16], w[17], offset); + w[50] = amd_bytealign (w[15], w[16], offset); + w[49] = amd_bytealign (w[14], w[15], offset); + w[48] = amd_bytealign (w[13], w[14], offset); + w[47] = amd_bytealign (w[12], w[13], offset); + w[46] = amd_bytealign (w[11], w[12], offset); + w[45] = amd_bytealign (w[10], w[11], offset); + w[44] = amd_bytealign (w[ 9], w[10], offset); + w[43] = amd_bytealign (w[ 8], w[ 9], offset); + w[42] = amd_bytealign (w[ 7], w[ 8], offset); + w[41] = amd_bytealign (w[ 6], w[ 7], offset); + w[40] = amd_bytealign (w[ 5], w[ 6], offset); + w[39] = amd_bytealign (w[ 4], w[ 5], offset); + w[38] = amd_bytealign (w[ 3], w[ 4], offset); + w[37] = amd_bytealign (w[ 2], w[ 3], offset); + w[36] = amd_bytealign (w[ 1], w[ 2], offset); + w[35] = amd_bytealign (w[ 0], w[ 1], offset); + w[34] = amd_bytealign ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign (w[27], w[28], offset); + w[62] = amd_bytealign (w[26], w[27], offset); + w[61] = amd_bytealign (w[25], w[26], offset); + w[60] = amd_bytealign (w[24], w[25], offset); + w[59] = amd_bytealign (w[23], w[24], offset); + w[58] = amd_bytealign (w[22], w[23], offset); + w[57] = amd_bytealign (w[21], w[22], offset); + w[56] = amd_bytealign (w[20], w[21], offset); + w[55] = amd_bytealign (w[19], w[20], offset); + w[54] = amd_bytealign (w[18], w[19], offset); + w[53] = amd_bytealign (w[17], w[18], offset); + w[52] = amd_bytealign (w[16], w[17], offset); + w[51] = amd_bytealign (w[15], w[16], offset); + w[50] = amd_bytealign (w[14], w[15], offset); + w[49] = amd_bytealign (w[13], w[14], offset); + w[48] = amd_bytealign (w[12], w[13], offset); + w[47] = amd_bytealign (w[11], w[12], offset); + w[46] = amd_bytealign (w[10], w[11], offset); + w[45] = amd_bytealign (w[ 9], w[10], offset); + w[44] = amd_bytealign (w[ 8], w[ 9], offset); + w[43] = amd_bytealign (w[ 7], w[ 8], offset); + w[42] = amd_bytealign (w[ 6], w[ 7], offset); + w[41] = amd_bytealign (w[ 5], w[ 6], offset); + w[40] = amd_bytealign (w[ 4], w[ 5], offset); + w[39] = amd_bytealign (w[ 3], w[ 4], offset); + w[38] = amd_bytealign (w[ 2], w[ 3], offset); + w[37] = amd_bytealign (w[ 1], w[ 2], offset); + w[36] = amd_bytealign (w[ 0], w[ 1], offset); + w[35] = amd_bytealign ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign (w[26], w[27], offset); + w[62] = amd_bytealign (w[25], w[26], offset); + w[61] = amd_bytealign (w[24], w[25], offset); + w[60] = amd_bytealign (w[23], w[24], offset); + w[59] = amd_bytealign (w[22], w[23], offset); + w[58] = amd_bytealign (w[21], w[22], offset); + w[57] = amd_bytealign (w[20], w[21], offset); + w[56] = amd_bytealign (w[19], w[20], offset); + w[55] = amd_bytealign (w[18], w[19], offset); + w[54] = amd_bytealign (w[17], w[18], offset); + w[53] = amd_bytealign (w[16], w[17], offset); + w[52] = amd_bytealign (w[15], w[16], offset); + w[51] = amd_bytealign (w[14], w[15], offset); + w[50] = amd_bytealign (w[13], w[14], offset); + w[49] = amd_bytealign (w[12], w[13], offset); + w[48] = amd_bytealign (w[11], w[12], offset); + w[47] = amd_bytealign (w[10], w[11], offset); + w[46] = amd_bytealign (w[ 9], w[10], offset); + w[45] = amd_bytealign (w[ 8], w[ 9], offset); + w[44] = amd_bytealign (w[ 7], w[ 8], offset); + w[43] = amd_bytealign (w[ 6], w[ 7], offset); + w[42] = amd_bytealign (w[ 5], w[ 6], offset); + w[41] = amd_bytealign (w[ 4], w[ 5], offset); + w[40] = amd_bytealign (w[ 3], w[ 4], offset); + w[39] = amd_bytealign (w[ 2], w[ 3], offset); + w[38] = amd_bytealign (w[ 1], w[ 2], offset); + w[37] = amd_bytealign (w[ 0], w[ 1], offset); + w[36] = amd_bytealign ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign (w[25], w[26], offset); + w[62] = amd_bytealign (w[24], w[25], offset); + w[61] = amd_bytealign (w[23], w[24], offset); + w[60] = amd_bytealign (w[22], w[23], offset); + w[59] = amd_bytealign (w[21], w[22], offset); + w[58] = amd_bytealign (w[20], w[21], offset); + w[57] = amd_bytealign (w[19], w[20], offset); + w[56] = amd_bytealign (w[18], w[19], offset); + w[55] = amd_bytealign (w[17], w[18], offset); + w[54] = amd_bytealign (w[16], w[17], offset); + w[53] = amd_bytealign (w[15], w[16], offset); + w[52] = amd_bytealign (w[14], w[15], offset); + w[51] = amd_bytealign (w[13], w[14], offset); + w[50] = amd_bytealign (w[12], w[13], offset); + w[49] = amd_bytealign (w[11], w[12], offset); + w[48] = amd_bytealign (w[10], w[11], offset); + w[47] = amd_bytealign (w[ 9], w[10], offset); + w[46] = amd_bytealign (w[ 8], w[ 9], offset); + w[45] = amd_bytealign (w[ 7], w[ 8], offset); + w[44] = amd_bytealign (w[ 6], w[ 7], offset); + w[43] = amd_bytealign (w[ 5], w[ 6], offset); + w[42] = amd_bytealign (w[ 4], w[ 5], offset); + w[41] = amd_bytealign (w[ 3], w[ 4], offset); + w[40] = amd_bytealign (w[ 2], w[ 3], offset); + w[39] = amd_bytealign (w[ 1], w[ 2], offset); + w[38] = amd_bytealign (w[ 0], w[ 1], offset); + w[37] = amd_bytealign ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign (w[24], w[25], offset); + w[62] = amd_bytealign (w[23], w[24], offset); + w[61] = amd_bytealign (w[22], w[23], offset); + w[60] = amd_bytealign (w[21], w[22], offset); + w[59] = amd_bytealign (w[20], w[21], offset); + w[58] = amd_bytealign (w[19], w[20], offset); + w[57] = amd_bytealign (w[18], w[19], offset); + w[56] = amd_bytealign (w[17], w[18], offset); + w[55] = amd_bytealign (w[16], w[17], offset); + w[54] = amd_bytealign (w[15], w[16], offset); + w[53] = amd_bytealign (w[14], w[15], offset); + w[52] = amd_bytealign (w[13], w[14], offset); + w[51] = amd_bytealign (w[12], w[13], offset); + w[50] = amd_bytealign (w[11], w[12], offset); + w[49] = amd_bytealign (w[10], w[11], offset); + w[48] = amd_bytealign (w[ 9], w[10], offset); + w[47] = amd_bytealign (w[ 8], w[ 9], offset); + w[46] = amd_bytealign (w[ 7], w[ 8], offset); + w[45] = amd_bytealign (w[ 6], w[ 7], offset); + w[44] = amd_bytealign (w[ 5], w[ 6], offset); + w[43] = amd_bytealign (w[ 4], w[ 5], offset); + w[42] = amd_bytealign (w[ 3], w[ 4], offset); + w[41] = amd_bytealign (w[ 2], w[ 3], offset); + w[40] = amd_bytealign (w[ 1], w[ 2], offset); + w[39] = amd_bytealign (w[ 0], w[ 1], offset); + w[38] = amd_bytealign ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign (w[23], w[24], offset); + w[62] = amd_bytealign (w[22], w[23], offset); + w[61] = amd_bytealign (w[21], w[22], offset); + w[60] = amd_bytealign (w[20], w[21], offset); + w[59] = amd_bytealign (w[19], w[20], offset); + w[58] = amd_bytealign (w[18], w[19], offset); + w[57] = amd_bytealign (w[17], w[18], offset); + w[56] = amd_bytealign (w[16], w[17], offset); + w[55] = amd_bytealign (w[15], w[16], offset); + w[54] = amd_bytealign (w[14], w[15], offset); + w[53] = amd_bytealign (w[13], w[14], offset); + w[52] = amd_bytealign (w[12], w[13], offset); + w[51] = amd_bytealign (w[11], w[12], offset); + w[50] = amd_bytealign (w[10], w[11], offset); + w[49] = amd_bytealign (w[ 9], w[10], offset); + w[48] = amd_bytealign (w[ 8], w[ 9], offset); + w[47] = amd_bytealign (w[ 7], w[ 8], offset); + w[46] = amd_bytealign (w[ 6], w[ 7], offset); + w[45] = amd_bytealign (w[ 5], w[ 6], offset); + w[44] = amd_bytealign (w[ 4], w[ 5], offset); + w[43] = amd_bytealign (w[ 3], w[ 4], offset); + w[42] = amd_bytealign (w[ 2], w[ 3], offset); + w[41] = amd_bytealign (w[ 1], w[ 2], offset); + w[40] = amd_bytealign (w[ 0], w[ 1], offset); + w[39] = amd_bytealign ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign (w[22], w[23], offset); + w[62] = amd_bytealign (w[21], w[22], offset); + w[61] = amd_bytealign (w[20], w[21], offset); + w[60] = amd_bytealign (w[19], w[20], offset); + w[59] = amd_bytealign (w[18], w[19], offset); + w[58] = amd_bytealign (w[17], w[18], offset); + w[57] = amd_bytealign (w[16], w[17], offset); + w[56] = amd_bytealign (w[15], w[16], offset); + w[55] = amd_bytealign (w[14], w[15], offset); + w[54] = amd_bytealign (w[13], w[14], offset); + w[53] = amd_bytealign (w[12], w[13], offset); + w[52] = amd_bytealign (w[11], w[12], offset); + w[51] = amd_bytealign (w[10], w[11], offset); + w[50] = amd_bytealign (w[ 9], w[10], offset); + w[49] = amd_bytealign (w[ 8], w[ 9], offset); + w[48] = amd_bytealign (w[ 7], w[ 8], offset); + w[47] = amd_bytealign (w[ 6], w[ 7], offset); + w[46] = amd_bytealign (w[ 5], w[ 6], offset); + w[45] = amd_bytealign (w[ 4], w[ 5], offset); + w[44] = amd_bytealign (w[ 3], w[ 4], offset); + w[43] = amd_bytealign (w[ 2], w[ 3], offset); + w[42] = amd_bytealign (w[ 1], w[ 2], offset); + w[41] = amd_bytealign (w[ 0], w[ 1], offset); + w[40] = amd_bytealign ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign (w[21], w[22], offset); + w[62] = amd_bytealign (w[20], w[21], offset); + w[61] = amd_bytealign (w[19], w[20], offset); + w[60] = amd_bytealign (w[18], w[19], offset); + w[59] = amd_bytealign (w[17], w[18], offset); + w[58] = amd_bytealign (w[16], w[17], offset); + w[57] = amd_bytealign (w[15], w[16], offset); + w[56] = amd_bytealign (w[14], w[15], offset); + w[55] = amd_bytealign (w[13], w[14], offset); + w[54] = amd_bytealign (w[12], w[13], offset); + w[53] = amd_bytealign (w[11], w[12], offset); + w[52] = amd_bytealign (w[10], w[11], offset); + w[51] = amd_bytealign (w[ 9], w[10], offset); + w[50] = amd_bytealign (w[ 8], w[ 9], offset); + w[49] = amd_bytealign (w[ 7], w[ 8], offset); + w[48] = amd_bytealign (w[ 6], w[ 7], offset); + w[47] = amd_bytealign (w[ 5], w[ 6], offset); + w[46] = amd_bytealign (w[ 4], w[ 5], offset); + w[45] = amd_bytealign (w[ 3], w[ 4], offset); + w[44] = amd_bytealign (w[ 2], w[ 3], offset); + w[43] = amd_bytealign (w[ 1], w[ 2], offset); + w[42] = amd_bytealign (w[ 0], w[ 1], offset); + w[41] = amd_bytealign ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign (w[20], w[21], offset); + w[62] = amd_bytealign (w[19], w[20], offset); + w[61] = amd_bytealign (w[18], w[19], offset); + w[60] = amd_bytealign (w[17], w[18], offset); + w[59] = amd_bytealign (w[16], w[17], offset); + w[58] = amd_bytealign (w[15], w[16], offset); + w[57] = amd_bytealign (w[14], w[15], offset); + w[56] = amd_bytealign (w[13], w[14], offset); + w[55] = amd_bytealign (w[12], w[13], offset); + w[54] = amd_bytealign (w[11], w[12], offset); + w[53] = amd_bytealign (w[10], w[11], offset); + w[52] = amd_bytealign (w[ 9], w[10], offset); + w[51] = amd_bytealign (w[ 8], w[ 9], offset); + w[50] = amd_bytealign (w[ 7], w[ 8], offset); + w[49] = amd_bytealign (w[ 6], w[ 7], offset); + w[48] = amd_bytealign (w[ 5], w[ 6], offset); + w[47] = amd_bytealign (w[ 4], w[ 5], offset); + w[46] = amd_bytealign (w[ 3], w[ 4], offset); + w[45] = amd_bytealign (w[ 2], w[ 3], offset); + w[44] = amd_bytealign (w[ 1], w[ 2], offset); + w[43] = amd_bytealign (w[ 0], w[ 1], offset); + w[42] = amd_bytealign ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign (w[19], w[20], offset); + w[62] = amd_bytealign (w[18], w[19], offset); + w[61] = amd_bytealign (w[17], w[18], offset); + w[60] = amd_bytealign (w[16], w[17], offset); + w[59] = amd_bytealign (w[15], w[16], offset); + w[58] = amd_bytealign (w[14], w[15], offset); + w[57] = amd_bytealign (w[13], w[14], offset); + w[56] = amd_bytealign (w[12], w[13], offset); + w[55] = amd_bytealign (w[11], w[12], offset); + w[54] = amd_bytealign (w[10], w[11], offset); + w[53] = amd_bytealign (w[ 9], w[10], offset); + w[52] = amd_bytealign (w[ 8], w[ 9], offset); + w[51] = amd_bytealign (w[ 7], w[ 8], offset); + w[50] = amd_bytealign (w[ 6], w[ 7], offset); + w[49] = amd_bytealign (w[ 5], w[ 6], offset); + w[48] = amd_bytealign (w[ 4], w[ 5], offset); + w[47] = amd_bytealign (w[ 3], w[ 4], offset); + w[46] = amd_bytealign (w[ 2], w[ 3], offset); + w[45] = amd_bytealign (w[ 1], w[ 2], offset); + w[44] = amd_bytealign (w[ 0], w[ 1], offset); + w[43] = amd_bytealign ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = amd_bytealign (w[18], w[19], offset); + w[62] = amd_bytealign (w[17], w[18], offset); + w[61] = amd_bytealign (w[16], w[17], offset); + w[60] = amd_bytealign (w[15], w[16], offset); + w[59] = amd_bytealign (w[14], w[15], offset); + w[58] = amd_bytealign (w[13], w[14], offset); + w[57] = amd_bytealign (w[12], w[13], offset); + w[56] = amd_bytealign (w[11], w[12], offset); + w[55] = amd_bytealign (w[10], w[11], offset); + w[54] = amd_bytealign (w[ 9], w[10], offset); + w[53] = amd_bytealign (w[ 8], w[ 9], offset); + w[52] = amd_bytealign (w[ 7], w[ 8], offset); + w[51] = amd_bytealign (w[ 6], w[ 7], offset); + w[50] = amd_bytealign (w[ 5], w[ 6], offset); + w[49] = amd_bytealign (w[ 4], w[ 5], offset); + w[48] = amd_bytealign (w[ 3], w[ 4], offset); + w[47] = amd_bytealign (w[ 2], w[ 3], offset); + w[46] = amd_bytealign (w[ 1], w[ 2], offset); + w[45] = amd_bytealign (w[ 0], w[ 1], offset); + w[44] = amd_bytealign ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign (w[17], w[18], offset); + w[62] = amd_bytealign (w[16], w[17], offset); + w[61] = amd_bytealign (w[15], w[16], offset); + w[60] = amd_bytealign (w[14], w[15], offset); + w[59] = amd_bytealign (w[13], w[14], offset); + w[58] = amd_bytealign (w[12], w[13], offset); + w[57] = amd_bytealign (w[11], w[12], offset); + w[56] = amd_bytealign (w[10], w[11], offset); + w[55] = amd_bytealign (w[ 9], w[10], offset); + w[54] = amd_bytealign (w[ 8], w[ 9], offset); + w[53] = amd_bytealign (w[ 7], w[ 8], offset); + w[52] = amd_bytealign (w[ 6], w[ 7], offset); + w[51] = amd_bytealign (w[ 5], w[ 6], offset); + w[50] = amd_bytealign (w[ 4], w[ 5], offset); + w[49] = amd_bytealign (w[ 3], w[ 4], offset); + w[48] = amd_bytealign (w[ 2], w[ 3], offset); + w[47] = amd_bytealign (w[ 1], w[ 2], offset); + w[46] = amd_bytealign (w[ 0], w[ 1], offset); + w[45] = amd_bytealign ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign (w[16], w[17], offset); + w[62] = amd_bytealign (w[15], w[16], offset); + w[61] = amd_bytealign (w[14], w[15], offset); + w[60] = amd_bytealign (w[13], w[14], offset); + w[59] = amd_bytealign (w[12], w[13], offset); + w[58] = amd_bytealign (w[11], w[12], offset); + w[57] = amd_bytealign (w[10], w[11], offset); + w[56] = amd_bytealign (w[ 9], w[10], offset); + w[55] = amd_bytealign (w[ 8], w[ 9], offset); + w[54] = amd_bytealign (w[ 7], w[ 8], offset); + w[53] = amd_bytealign (w[ 6], w[ 7], offset); + w[52] = amd_bytealign (w[ 5], w[ 6], offset); + w[51] = amd_bytealign (w[ 4], w[ 5], offset); + w[50] = amd_bytealign (w[ 3], w[ 4], offset); + w[49] = amd_bytealign (w[ 2], w[ 3], offset); + w[48] = amd_bytealign (w[ 1], w[ 2], offset); + w[47] = amd_bytealign (w[ 0], w[ 1], offset); + w[46] = amd_bytealign ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign (w[15], w[16], offset); + w[62] = amd_bytealign (w[14], w[15], offset); + w[61] = amd_bytealign (w[13], w[14], offset); + w[60] = amd_bytealign (w[12], w[13], offset); + w[59] = amd_bytealign (w[11], w[12], offset); + w[58] = amd_bytealign (w[10], w[11], offset); + w[57] = amd_bytealign (w[ 9], w[10], offset); + w[56] = amd_bytealign (w[ 8], w[ 9], offset); + w[55] = amd_bytealign (w[ 7], w[ 8], offset); + w[54] = amd_bytealign (w[ 6], w[ 7], offset); + w[53] = amd_bytealign (w[ 5], w[ 6], offset); + w[52] = amd_bytealign (w[ 4], w[ 5], offset); + w[51] = amd_bytealign (w[ 3], w[ 4], offset); + w[50] = amd_bytealign (w[ 2], w[ 3], offset); + w[49] = amd_bytealign (w[ 1], w[ 2], offset); + w[48] = amd_bytealign (w[ 0], w[ 1], offset); + w[47] = amd_bytealign ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign (w[14], w[15], offset); + w[62] = amd_bytealign (w[13], w[14], offset); + w[61] = amd_bytealign (w[12], w[13], offset); + w[60] = amd_bytealign (w[11], w[12], offset); + w[59] = amd_bytealign (w[10], w[11], offset); + w[58] = amd_bytealign (w[ 9], w[10], offset); + w[57] = amd_bytealign (w[ 8], w[ 9], offset); + w[56] = amd_bytealign (w[ 7], w[ 8], offset); + w[55] = amd_bytealign (w[ 6], w[ 7], offset); + w[54] = amd_bytealign (w[ 5], w[ 6], offset); + w[53] = amd_bytealign (w[ 4], w[ 5], offset); + w[52] = amd_bytealign (w[ 3], w[ 4], offset); + w[51] = amd_bytealign (w[ 2], w[ 3], offset); + w[50] = amd_bytealign (w[ 1], w[ 2], offset); + w[49] = amd_bytealign (w[ 0], w[ 1], offset); + w[48] = amd_bytealign ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign (w[13], w[14], offset); + w[62] = amd_bytealign (w[12], w[13], offset); + w[61] = amd_bytealign (w[11], w[12], offset); + w[60] = amd_bytealign (w[10], w[11], offset); + w[59] = amd_bytealign (w[ 9], w[10], offset); + w[58] = amd_bytealign (w[ 8], w[ 9], offset); + w[57] = amd_bytealign (w[ 7], w[ 8], offset); + w[56] = amd_bytealign (w[ 6], w[ 7], offset); + w[55] = amd_bytealign (w[ 5], w[ 6], offset); + w[54] = amd_bytealign (w[ 4], w[ 5], offset); + w[53] = amd_bytealign (w[ 3], w[ 4], offset); + w[52] = amd_bytealign (w[ 2], w[ 3], offset); + w[51] = amd_bytealign (w[ 1], w[ 2], offset); + w[50] = amd_bytealign (w[ 0], w[ 1], offset); + w[49] = amd_bytealign ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign (w[12], w[13], offset); + w[62] = amd_bytealign (w[11], w[12], offset); + w[61] = amd_bytealign (w[10], w[11], offset); + w[60] = amd_bytealign (w[ 9], w[10], offset); + w[59] = amd_bytealign (w[ 8], w[ 9], offset); + w[58] = amd_bytealign (w[ 7], w[ 8], offset); + w[57] = amd_bytealign (w[ 6], w[ 7], offset); + w[56] = amd_bytealign (w[ 5], w[ 6], offset); + w[55] = amd_bytealign (w[ 4], w[ 5], offset); + w[54] = amd_bytealign (w[ 3], w[ 4], offset); + w[53] = amd_bytealign (w[ 2], w[ 3], offset); + w[52] = amd_bytealign (w[ 1], w[ 2], offset); + w[51] = amd_bytealign (w[ 0], w[ 1], offset); + w[50] = amd_bytealign ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = amd_bytealign (w[11], w[12], offset); + w[62] = amd_bytealign (w[10], w[11], offset); + w[61] = amd_bytealign (w[ 9], w[10], offset); + w[60] = amd_bytealign (w[ 8], w[ 9], offset); + w[59] = amd_bytealign (w[ 7], w[ 8], offset); + w[58] = amd_bytealign (w[ 6], w[ 7], offset); + w[57] = amd_bytealign (w[ 5], w[ 6], offset); + w[56] = amd_bytealign (w[ 4], w[ 5], offset); + w[55] = amd_bytealign (w[ 3], w[ 4], offset); + w[54] = amd_bytealign (w[ 2], w[ 3], offset); + w[53] = amd_bytealign (w[ 1], w[ 2], offset); + w[52] = amd_bytealign (w[ 0], w[ 1], offset); + w[51] = amd_bytealign ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign (w[10], w[11], offset); + w[62] = amd_bytealign (w[ 9], w[10], offset); + w[61] = amd_bytealign (w[ 8], w[ 9], offset); + w[60] = amd_bytealign (w[ 7], w[ 8], offset); + w[59] = amd_bytealign (w[ 6], w[ 7], offset); + w[58] = amd_bytealign (w[ 5], w[ 6], offset); + w[57] = amd_bytealign (w[ 4], w[ 5], offset); + w[56] = amd_bytealign (w[ 3], w[ 4], offset); + w[55] = amd_bytealign (w[ 2], w[ 3], offset); + w[54] = amd_bytealign (w[ 1], w[ 2], offset); + w[53] = amd_bytealign (w[ 0], w[ 1], offset); + w[52] = amd_bytealign ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign (w[ 9], w[10], offset); + w[62] = amd_bytealign (w[ 8], w[ 9], offset); + w[61] = amd_bytealign (w[ 7], w[ 8], offset); + w[60] = amd_bytealign (w[ 6], w[ 7], offset); + w[59] = amd_bytealign (w[ 5], w[ 6], offset); + w[58] = amd_bytealign (w[ 4], w[ 5], offset); + w[57] = amd_bytealign (w[ 3], w[ 4], offset); + w[56] = amd_bytealign (w[ 2], w[ 3], offset); + w[55] = amd_bytealign (w[ 1], w[ 2], offset); + w[54] = amd_bytealign (w[ 0], w[ 1], offset); + w[53] = amd_bytealign ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign (w[ 8], w[ 9], offset); + w[62] = amd_bytealign (w[ 7], w[ 8], offset); + w[61] = amd_bytealign (w[ 6], w[ 7], offset); + w[60] = amd_bytealign (w[ 5], w[ 6], offset); + w[59] = amd_bytealign (w[ 4], w[ 5], offset); + w[58] = amd_bytealign (w[ 3], w[ 4], offset); + w[57] = amd_bytealign (w[ 2], w[ 3], offset); + w[56] = amd_bytealign (w[ 1], w[ 2], offset); + w[55] = amd_bytealign (w[ 0], w[ 1], offset); + w[54] = amd_bytealign ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign (w[ 7], w[ 8], offset); + w[62] = amd_bytealign (w[ 6], w[ 7], offset); + w[61] = amd_bytealign (w[ 5], w[ 6], offset); + w[60] = amd_bytealign (w[ 4], w[ 5], offset); + w[59] = amd_bytealign (w[ 3], w[ 4], offset); + w[58] = amd_bytealign (w[ 2], w[ 3], offset); + w[57] = amd_bytealign (w[ 1], w[ 2], offset); + w[56] = amd_bytealign (w[ 0], w[ 1], offset); + w[55] = amd_bytealign ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign (w[ 6], w[ 7], offset); + w[62] = amd_bytealign (w[ 5], w[ 6], offset); + w[61] = amd_bytealign (w[ 4], w[ 5], offset); + w[60] = amd_bytealign (w[ 3], w[ 4], offset); + w[59] = amd_bytealign (w[ 2], w[ 3], offset); + w[58] = amd_bytealign (w[ 1], w[ 2], offset); + w[57] = amd_bytealign (w[ 0], w[ 1], offset); + w[56] = amd_bytealign ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign (w[ 5], w[ 6], offset); + w[62] = amd_bytealign (w[ 4], w[ 5], offset); + w[61] = amd_bytealign (w[ 3], w[ 4], offset); + w[60] = amd_bytealign (w[ 2], w[ 3], offset); + w[59] = amd_bytealign (w[ 1], w[ 2], offset); + w[58] = amd_bytealign (w[ 0], w[ 1], offset); + w[57] = amd_bytealign ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign (w[ 4], w[ 5], offset); + w[62] = amd_bytealign (w[ 3], w[ 4], offset); + w[61] = amd_bytealign (w[ 2], w[ 3], offset); + w[60] = amd_bytealign (w[ 1], w[ 2], offset); + w[59] = amd_bytealign (w[ 0], w[ 1], offset); + w[58] = amd_bytealign ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = amd_bytealign (w[ 3], w[ 4], offset); + w[62] = amd_bytealign (w[ 2], w[ 3], offset); + w[61] = amd_bytealign (w[ 1], w[ 2], offset); + w[60] = amd_bytealign (w[ 0], w[ 1], offset); + w[59] = amd_bytealign ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = amd_bytealign (w[ 2], w[ 3], offset); + w[62] = amd_bytealign (w[ 1], w[ 2], offset); + w[61] = amd_bytealign (w[ 0], w[ 1], offset); + w[60] = amd_bytealign ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign (w[ 1], w[ 2], offset); + w[62] = amd_bytealign (w[ 0], w[ 1], offset); + w[61] = amd_bytealign ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign (w[ 0], w[ 1], offset); + w[62] = amd_bytealign ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); + + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm (w[62], w[63], selector); + w[62] = __byte_perm (w[61], w[62], selector); + w[61] = __byte_perm (w[60], w[61], selector); + w[60] = __byte_perm (w[59], w[60], selector); + w[59] = __byte_perm (w[58], w[59], selector); + w[58] = __byte_perm (w[57], w[58], selector); + w[57] = __byte_perm (w[56], w[57], selector); + w[56] = __byte_perm (w[55], w[56], selector); + w[55] = __byte_perm (w[54], w[55], selector); + w[54] = __byte_perm (w[53], w[54], selector); + w[53] = __byte_perm (w[52], w[53], selector); + w[52] = __byte_perm (w[51], w[52], selector); + w[51] = __byte_perm (w[50], w[51], selector); + w[50] = __byte_perm (w[49], w[50], selector); + w[49] = __byte_perm (w[48], w[49], selector); + w[48] = __byte_perm (w[47], w[48], selector); + w[47] = __byte_perm (w[46], w[47], selector); + w[46] = __byte_perm (w[45], w[46], selector); + w[45] = __byte_perm (w[44], w[45], selector); + w[44] = __byte_perm (w[43], w[44], selector); + w[43] = __byte_perm (w[42], w[43], selector); + w[42] = __byte_perm (w[41], w[42], selector); + w[41] = __byte_perm (w[40], w[41], selector); + w[40] = __byte_perm (w[39], w[40], selector); + w[39] = __byte_perm (w[38], w[39], selector); + w[38] = __byte_perm (w[37], w[38], selector); + w[37] = __byte_perm (w[36], w[37], selector); + w[36] = __byte_perm (w[35], w[36], selector); + w[35] = __byte_perm (w[34], w[35], selector); + w[34] = __byte_perm (w[33], w[34], selector); + w[33] = __byte_perm (w[32], w[33], selector); + w[32] = __byte_perm (w[31], w[32], selector); + w[31] = __byte_perm (w[30], w[31], selector); + w[30] = __byte_perm (w[29], w[30], selector); + w[29] = __byte_perm (w[28], w[29], selector); + w[28] = __byte_perm (w[27], w[28], selector); + w[27] = __byte_perm (w[26], w[27], selector); + w[26] = __byte_perm (w[25], w[26], selector); + w[25] = __byte_perm (w[24], w[25], selector); + w[24] = __byte_perm (w[23], w[24], selector); + w[23] = __byte_perm (w[22], w[23], selector); + w[22] = __byte_perm (w[21], w[22], selector); + w[21] = __byte_perm (w[20], w[21], selector); + w[20] = __byte_perm (w[19], w[20], selector); + w[19] = __byte_perm (w[18], w[19], selector); + w[18] = __byte_perm (w[17], w[18], selector); + w[17] = __byte_perm (w[16], w[17], selector); + w[16] = __byte_perm (w[15], w[16], selector); + w[15] = __byte_perm (w[14], w[15], selector); + w[14] = __byte_perm (w[13], w[14], selector); + w[13] = __byte_perm (w[12], w[13], selector); + w[12] = __byte_perm (w[11], w[12], selector); + w[11] = __byte_perm (w[10], w[11], selector); + w[10] = __byte_perm (w[ 9], w[10], selector); + w[ 9] = __byte_perm (w[ 8], w[ 9], selector); + w[ 8] = __byte_perm (w[ 7], w[ 8], selector); + w[ 7] = __byte_perm (w[ 6], w[ 7], selector); + w[ 6] = __byte_perm (w[ 5], w[ 6], selector); + w[ 5] = __byte_perm (w[ 4], w[ 5], selector); + w[ 4] = __byte_perm (w[ 3], w[ 4], selector); + w[ 3] = __byte_perm (w[ 2], w[ 3], selector); + w[ 2] = __byte_perm (w[ 1], w[ 2], selector); + w[ 1] = __byte_perm (w[ 0], w[ 1], selector); + w[ 0] = __byte_perm ( 0, w[ 0], selector); + + break; + + case 1: + w[63] = __byte_perm (w[61], w[62], selector); + w[62] = __byte_perm (w[60], w[61], selector); + w[61] = __byte_perm (w[59], w[60], selector); + w[60] = __byte_perm (w[58], w[59], selector); + w[59] = __byte_perm (w[57], w[58], selector); + w[58] = __byte_perm (w[56], w[57], selector); + w[57] = __byte_perm (w[55], w[56], selector); + w[56] = __byte_perm (w[54], w[55], selector); + w[55] = __byte_perm (w[53], w[54], selector); + w[54] = __byte_perm (w[52], w[53], selector); + w[53] = __byte_perm (w[51], w[52], selector); + w[52] = __byte_perm (w[50], w[51], selector); + w[51] = __byte_perm (w[49], w[50], selector); + w[50] = __byte_perm (w[48], w[49], selector); + w[49] = __byte_perm (w[47], w[48], selector); + w[48] = __byte_perm (w[46], w[47], selector); + w[47] = __byte_perm (w[45], w[46], selector); + w[46] = __byte_perm (w[44], w[45], selector); + w[45] = __byte_perm (w[43], w[44], selector); + w[44] = __byte_perm (w[42], w[43], selector); + w[43] = __byte_perm (w[41], w[42], selector); + w[42] = __byte_perm (w[40], w[41], selector); + w[41] = __byte_perm (w[39], w[40], selector); + w[40] = __byte_perm (w[38], w[39], selector); + w[39] = __byte_perm (w[37], w[38], selector); + w[38] = __byte_perm (w[36], w[37], selector); + w[37] = __byte_perm (w[35], w[36], selector); + w[36] = __byte_perm (w[34], w[35], selector); + w[35] = __byte_perm (w[33], w[34], selector); + w[34] = __byte_perm (w[32], w[33], selector); + w[33] = __byte_perm (w[31], w[32], selector); + w[32] = __byte_perm (w[30], w[31], selector); + w[31] = __byte_perm (w[29], w[30], selector); + w[30] = __byte_perm (w[28], w[29], selector); + w[29] = __byte_perm (w[27], w[28], selector); + w[28] = __byte_perm (w[26], w[27], selector); + w[27] = __byte_perm (w[25], w[26], selector); + w[26] = __byte_perm (w[24], w[25], selector); + w[25] = __byte_perm (w[23], w[24], selector); + w[24] = __byte_perm (w[22], w[23], selector); + w[23] = __byte_perm (w[21], w[22], selector); + w[22] = __byte_perm (w[20], w[21], selector); + w[21] = __byte_perm (w[19], w[20], selector); + w[20] = __byte_perm (w[18], w[19], selector); + w[19] = __byte_perm (w[17], w[18], selector); + w[18] = __byte_perm (w[16], w[17], selector); + w[17] = __byte_perm (w[15], w[16], selector); + w[16] = __byte_perm (w[14], w[15], selector); + w[15] = __byte_perm (w[13], w[14], selector); + w[14] = __byte_perm (w[12], w[13], selector); + w[13] = __byte_perm (w[11], w[12], selector); + w[12] = __byte_perm (w[10], w[11], selector); + w[11] = __byte_perm (w[ 9], w[10], selector); + w[10] = __byte_perm (w[ 8], w[ 9], selector); + w[ 9] = __byte_perm (w[ 7], w[ 8], selector); + w[ 8] = __byte_perm (w[ 6], w[ 7], selector); + w[ 7] = __byte_perm (w[ 5], w[ 6], selector); + w[ 6] = __byte_perm (w[ 4], w[ 5], selector); + w[ 5] = __byte_perm (w[ 3], w[ 4], selector); + w[ 4] = __byte_perm (w[ 2], w[ 3], selector); + w[ 3] = __byte_perm (w[ 1], w[ 2], selector); + w[ 2] = __byte_perm (w[ 0], w[ 1], selector); + w[ 1] = __byte_perm ( 0, w[ 0], selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm (w[60], w[61], selector); + w[62] = __byte_perm (w[59], w[60], selector); + w[61] = __byte_perm (w[58], w[59], selector); + w[60] = __byte_perm (w[57], w[58], selector); + w[59] = __byte_perm (w[56], w[57], selector); + w[58] = __byte_perm (w[55], w[56], selector); + w[57] = __byte_perm (w[54], w[55], selector); + w[56] = __byte_perm (w[53], w[54], selector); + w[55] = __byte_perm (w[52], w[53], selector); + w[54] = __byte_perm (w[51], w[52], selector); + w[53] = __byte_perm (w[50], w[51], selector); + w[52] = __byte_perm (w[49], w[50], selector); + w[51] = __byte_perm (w[48], w[49], selector); + w[50] = __byte_perm (w[47], w[48], selector); + w[49] = __byte_perm (w[46], w[47], selector); + w[48] = __byte_perm (w[45], w[46], selector); + w[47] = __byte_perm (w[44], w[45], selector); + w[46] = __byte_perm (w[43], w[44], selector); + w[45] = __byte_perm (w[42], w[43], selector); + w[44] = __byte_perm (w[41], w[42], selector); + w[43] = __byte_perm (w[40], w[41], selector); + w[42] = __byte_perm (w[39], w[40], selector); + w[41] = __byte_perm (w[38], w[39], selector); + w[40] = __byte_perm (w[37], w[38], selector); + w[39] = __byte_perm (w[36], w[37], selector); + w[38] = __byte_perm (w[35], w[36], selector); + w[37] = __byte_perm (w[34], w[35], selector); + w[36] = __byte_perm (w[33], w[34], selector); + w[35] = __byte_perm (w[32], w[33], selector); + w[34] = __byte_perm (w[31], w[32], selector); + w[33] = __byte_perm (w[30], w[31], selector); + w[32] = __byte_perm (w[29], w[30], selector); + w[31] = __byte_perm (w[28], w[29], selector); + w[30] = __byte_perm (w[27], w[28], selector); + w[29] = __byte_perm (w[26], w[27], selector); + w[28] = __byte_perm (w[25], w[26], selector); + w[27] = __byte_perm (w[24], w[25], selector); + w[26] = __byte_perm (w[23], w[24], selector); + w[25] = __byte_perm (w[22], w[23], selector); + w[24] = __byte_perm (w[21], w[22], selector); + w[23] = __byte_perm (w[20], w[21], selector); + w[22] = __byte_perm (w[19], w[20], selector); + w[21] = __byte_perm (w[18], w[19], selector); + w[20] = __byte_perm (w[17], w[18], selector); + w[19] = __byte_perm (w[16], w[17], selector); + w[18] = __byte_perm (w[15], w[16], selector); + w[17] = __byte_perm (w[14], w[15], selector); + w[16] = __byte_perm (w[13], w[14], selector); + w[15] = __byte_perm (w[12], w[13], selector); + w[14] = __byte_perm (w[11], w[12], selector); + w[13] = __byte_perm (w[10], w[11], selector); + w[12] = __byte_perm (w[ 9], w[10], selector); + w[11] = __byte_perm (w[ 8], w[ 9], selector); + w[10] = __byte_perm (w[ 7], w[ 8], selector); + w[ 9] = __byte_perm (w[ 6], w[ 7], selector); + w[ 8] = __byte_perm (w[ 5], w[ 6], selector); + w[ 7] = __byte_perm (w[ 4], w[ 5], selector); + w[ 6] = __byte_perm (w[ 3], w[ 4], selector); + w[ 5] = __byte_perm (w[ 2], w[ 3], selector); + w[ 4] = __byte_perm (w[ 1], w[ 2], selector); + w[ 3] = __byte_perm (w[ 0], w[ 1], selector); + w[ 2] = __byte_perm ( 0, w[ 0], selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm (w[59], w[60], selector); + w[62] = __byte_perm (w[58], w[59], selector); + w[61] = __byte_perm (w[57], w[58], selector); + w[60] = __byte_perm (w[56], w[57], selector); + w[59] = __byte_perm (w[55], w[56], selector); + w[58] = __byte_perm (w[54], w[55], selector); + w[57] = __byte_perm (w[53], w[54], selector); + w[56] = __byte_perm (w[52], w[53], selector); + w[55] = __byte_perm (w[51], w[52], selector); + w[54] = __byte_perm (w[50], w[51], selector); + w[53] = __byte_perm (w[49], w[50], selector); + w[52] = __byte_perm (w[48], w[49], selector); + w[51] = __byte_perm (w[47], w[48], selector); + w[50] = __byte_perm (w[46], w[47], selector); + w[49] = __byte_perm (w[45], w[46], selector); + w[48] = __byte_perm (w[44], w[45], selector); + w[47] = __byte_perm (w[43], w[44], selector); + w[46] = __byte_perm (w[42], w[43], selector); + w[45] = __byte_perm (w[41], w[42], selector); + w[44] = __byte_perm (w[40], w[41], selector); + w[43] = __byte_perm (w[39], w[40], selector); + w[42] = __byte_perm (w[38], w[39], selector); + w[41] = __byte_perm (w[37], w[38], selector); + w[40] = __byte_perm (w[36], w[37], selector); + w[39] = __byte_perm (w[35], w[36], selector); + w[38] = __byte_perm (w[34], w[35], selector); + w[37] = __byte_perm (w[33], w[34], selector); + w[36] = __byte_perm (w[32], w[33], selector); + w[35] = __byte_perm (w[31], w[32], selector); + w[34] = __byte_perm (w[30], w[31], selector); + w[33] = __byte_perm (w[29], w[30], selector); + w[32] = __byte_perm (w[28], w[29], selector); + w[31] = __byte_perm (w[27], w[28], selector); + w[30] = __byte_perm (w[26], w[27], selector); + w[29] = __byte_perm (w[25], w[26], selector); + w[28] = __byte_perm (w[24], w[25], selector); + w[27] = __byte_perm (w[23], w[24], selector); + w[26] = __byte_perm (w[22], w[23], selector); + w[25] = __byte_perm (w[21], w[22], selector); + w[24] = __byte_perm (w[20], w[21], selector); + w[23] = __byte_perm (w[19], w[20], selector); + w[22] = __byte_perm (w[18], w[19], selector); + w[21] = __byte_perm (w[17], w[18], selector); + w[20] = __byte_perm (w[16], w[17], selector); + w[19] = __byte_perm (w[15], w[16], selector); + w[18] = __byte_perm (w[14], w[15], selector); + w[17] = __byte_perm (w[13], w[14], selector); + w[16] = __byte_perm (w[12], w[13], selector); + w[15] = __byte_perm (w[11], w[12], selector); + w[14] = __byte_perm (w[10], w[11], selector); + w[13] = __byte_perm (w[ 9], w[10], selector); + w[12] = __byte_perm (w[ 8], w[ 9], selector); + w[11] = __byte_perm (w[ 7], w[ 8], selector); + w[10] = __byte_perm (w[ 6], w[ 7], selector); + w[ 9] = __byte_perm (w[ 5], w[ 6], selector); + w[ 8] = __byte_perm (w[ 4], w[ 5], selector); + w[ 7] = __byte_perm (w[ 3], w[ 4], selector); + w[ 6] = __byte_perm (w[ 2], w[ 3], selector); + w[ 5] = __byte_perm (w[ 1], w[ 2], selector); + w[ 4] = __byte_perm (w[ 0], w[ 1], selector); + w[ 3] = __byte_perm ( 0, w[ 0], selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm (w[58], w[59], selector); + w[62] = __byte_perm (w[57], w[58], selector); + w[61] = __byte_perm (w[56], w[57], selector); + w[60] = __byte_perm (w[55], w[56], selector); + w[59] = __byte_perm (w[54], w[55], selector); + w[58] = __byte_perm (w[53], w[54], selector); + w[57] = __byte_perm (w[52], w[53], selector); + w[56] = __byte_perm (w[51], w[52], selector); + w[55] = __byte_perm (w[50], w[51], selector); + w[54] = __byte_perm (w[49], w[50], selector); + w[53] = __byte_perm (w[48], w[49], selector); + w[52] = __byte_perm (w[47], w[48], selector); + w[51] = __byte_perm (w[46], w[47], selector); + w[50] = __byte_perm (w[45], w[46], selector); + w[49] = __byte_perm (w[44], w[45], selector); + w[48] = __byte_perm (w[43], w[44], selector); + w[47] = __byte_perm (w[42], w[43], selector); + w[46] = __byte_perm (w[41], w[42], selector); + w[45] = __byte_perm (w[40], w[41], selector); + w[44] = __byte_perm (w[39], w[40], selector); + w[43] = __byte_perm (w[38], w[39], selector); + w[42] = __byte_perm (w[37], w[38], selector); + w[41] = __byte_perm (w[36], w[37], selector); + w[40] = __byte_perm (w[35], w[36], selector); + w[39] = __byte_perm (w[34], w[35], selector); + w[38] = __byte_perm (w[33], w[34], selector); + w[37] = __byte_perm (w[32], w[33], selector); + w[36] = __byte_perm (w[31], w[32], selector); + w[35] = __byte_perm (w[30], w[31], selector); + w[34] = __byte_perm (w[29], w[30], selector); + w[33] = __byte_perm (w[28], w[29], selector); + w[32] = __byte_perm (w[27], w[28], selector); + w[31] = __byte_perm (w[26], w[27], selector); + w[30] = __byte_perm (w[25], w[26], selector); + w[29] = __byte_perm (w[24], w[25], selector); + w[28] = __byte_perm (w[23], w[24], selector); + w[27] = __byte_perm (w[22], w[23], selector); + w[26] = __byte_perm (w[21], w[22], selector); + w[25] = __byte_perm (w[20], w[21], selector); + w[24] = __byte_perm (w[19], w[20], selector); + w[23] = __byte_perm (w[18], w[19], selector); + w[22] = __byte_perm (w[17], w[18], selector); + w[21] = __byte_perm (w[16], w[17], selector); + w[20] = __byte_perm (w[15], w[16], selector); + w[19] = __byte_perm (w[14], w[15], selector); + w[18] = __byte_perm (w[13], w[14], selector); + w[17] = __byte_perm (w[12], w[13], selector); + w[16] = __byte_perm (w[11], w[12], selector); + w[15] = __byte_perm (w[10], w[11], selector); + w[14] = __byte_perm (w[ 9], w[10], selector); + w[13] = __byte_perm (w[ 8], w[ 9], selector); + w[12] = __byte_perm (w[ 7], w[ 8], selector); + w[11] = __byte_perm (w[ 6], w[ 7], selector); + w[10] = __byte_perm (w[ 5], w[ 6], selector); + w[ 9] = __byte_perm (w[ 4], w[ 5], selector); + w[ 8] = __byte_perm (w[ 3], w[ 4], selector); + w[ 7] = __byte_perm (w[ 2], w[ 3], selector); + w[ 6] = __byte_perm (w[ 1], w[ 2], selector); + w[ 5] = __byte_perm (w[ 0], w[ 1], selector); + w[ 4] = __byte_perm ( 0, w[ 0], selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm (w[57], w[58], selector); + w[62] = __byte_perm (w[56], w[57], selector); + w[61] = __byte_perm (w[55], w[56], selector); + w[60] = __byte_perm (w[54], w[55], selector); + w[59] = __byte_perm (w[53], w[54], selector); + w[58] = __byte_perm (w[52], w[53], selector); + w[57] = __byte_perm (w[51], w[52], selector); + w[56] = __byte_perm (w[50], w[51], selector); + w[55] = __byte_perm (w[49], w[50], selector); + w[54] = __byte_perm (w[48], w[49], selector); + w[53] = __byte_perm (w[47], w[48], selector); + w[52] = __byte_perm (w[46], w[47], selector); + w[51] = __byte_perm (w[45], w[46], selector); + w[50] = __byte_perm (w[44], w[45], selector); + w[49] = __byte_perm (w[43], w[44], selector); + w[48] = __byte_perm (w[42], w[43], selector); + w[47] = __byte_perm (w[41], w[42], selector); + w[46] = __byte_perm (w[40], w[41], selector); + w[45] = __byte_perm (w[39], w[40], selector); + w[44] = __byte_perm (w[38], w[39], selector); + w[43] = __byte_perm (w[37], w[38], selector); + w[42] = __byte_perm (w[36], w[37], selector); + w[41] = __byte_perm (w[35], w[36], selector); + w[40] = __byte_perm (w[34], w[35], selector); + w[39] = __byte_perm (w[33], w[34], selector); + w[38] = __byte_perm (w[32], w[33], selector); + w[37] = __byte_perm (w[31], w[32], selector); + w[36] = __byte_perm (w[30], w[31], selector); + w[35] = __byte_perm (w[29], w[30], selector); + w[34] = __byte_perm (w[28], w[29], selector); + w[33] = __byte_perm (w[27], w[28], selector); + w[32] = __byte_perm (w[26], w[27], selector); + w[31] = __byte_perm (w[25], w[26], selector); + w[30] = __byte_perm (w[24], w[25], selector); + w[29] = __byte_perm (w[23], w[24], selector); + w[28] = __byte_perm (w[22], w[23], selector); + w[27] = __byte_perm (w[21], w[22], selector); + w[26] = __byte_perm (w[20], w[21], selector); + w[25] = __byte_perm (w[19], w[20], selector); + w[24] = __byte_perm (w[18], w[19], selector); + w[23] = __byte_perm (w[17], w[18], selector); + w[22] = __byte_perm (w[16], w[17], selector); + w[21] = __byte_perm (w[15], w[16], selector); + w[20] = __byte_perm (w[14], w[15], selector); + w[19] = __byte_perm (w[13], w[14], selector); + w[18] = __byte_perm (w[12], w[13], selector); + w[17] = __byte_perm (w[11], w[12], selector); + w[16] = __byte_perm (w[10], w[11], selector); + w[15] = __byte_perm (w[ 9], w[10], selector); + w[14] = __byte_perm (w[ 8], w[ 9], selector); + w[13] = __byte_perm (w[ 7], w[ 8], selector); + w[12] = __byte_perm (w[ 6], w[ 7], selector); + w[11] = __byte_perm (w[ 5], w[ 6], selector); + w[10] = __byte_perm (w[ 4], w[ 5], selector); + w[ 9] = __byte_perm (w[ 3], w[ 4], selector); + w[ 8] = __byte_perm (w[ 2], w[ 3], selector); + w[ 7] = __byte_perm (w[ 1], w[ 2], selector); + w[ 6] = __byte_perm (w[ 0], w[ 1], selector); + w[ 5] = __byte_perm ( 0, w[ 0], selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm (w[56], w[57], selector); + w[62] = __byte_perm (w[55], w[56], selector); + w[61] = __byte_perm (w[54], w[55], selector); + w[60] = __byte_perm (w[53], w[54], selector); + w[59] = __byte_perm (w[52], w[53], selector); + w[58] = __byte_perm (w[51], w[52], selector); + w[57] = __byte_perm (w[50], w[51], selector); + w[56] = __byte_perm (w[49], w[50], selector); + w[55] = __byte_perm (w[48], w[49], selector); + w[54] = __byte_perm (w[47], w[48], selector); + w[53] = __byte_perm (w[46], w[47], selector); + w[52] = __byte_perm (w[45], w[46], selector); + w[51] = __byte_perm (w[44], w[45], selector); + w[50] = __byte_perm (w[43], w[44], selector); + w[49] = __byte_perm (w[42], w[43], selector); + w[48] = __byte_perm (w[41], w[42], selector); + w[47] = __byte_perm (w[40], w[41], selector); + w[46] = __byte_perm (w[39], w[40], selector); + w[45] = __byte_perm (w[38], w[39], selector); + w[44] = __byte_perm (w[37], w[38], selector); + w[43] = __byte_perm (w[36], w[37], selector); + w[42] = __byte_perm (w[35], w[36], selector); + w[41] = __byte_perm (w[34], w[35], selector); + w[40] = __byte_perm (w[33], w[34], selector); + w[39] = __byte_perm (w[32], w[33], selector); + w[38] = __byte_perm (w[31], w[32], selector); + w[37] = __byte_perm (w[30], w[31], selector); + w[36] = __byte_perm (w[29], w[30], selector); + w[35] = __byte_perm (w[28], w[29], selector); + w[34] = __byte_perm (w[27], w[28], selector); + w[33] = __byte_perm (w[26], w[27], selector); + w[32] = __byte_perm (w[25], w[26], selector); + w[31] = __byte_perm (w[24], w[25], selector); + w[30] = __byte_perm (w[23], w[24], selector); + w[29] = __byte_perm (w[22], w[23], selector); + w[28] = __byte_perm (w[21], w[22], selector); + w[27] = __byte_perm (w[20], w[21], selector); + w[26] = __byte_perm (w[19], w[20], selector); + w[25] = __byte_perm (w[18], w[19], selector); + w[24] = __byte_perm (w[17], w[18], selector); + w[23] = __byte_perm (w[16], w[17], selector); + w[22] = __byte_perm (w[15], w[16], selector); + w[21] = __byte_perm (w[14], w[15], selector); + w[20] = __byte_perm (w[13], w[14], selector); + w[19] = __byte_perm (w[12], w[13], selector); + w[18] = __byte_perm (w[11], w[12], selector); + w[17] = __byte_perm (w[10], w[11], selector); + w[16] = __byte_perm (w[ 9], w[10], selector); + w[15] = __byte_perm (w[ 8], w[ 9], selector); + w[14] = __byte_perm (w[ 7], w[ 8], selector); + w[13] = __byte_perm (w[ 6], w[ 7], selector); + w[12] = __byte_perm (w[ 5], w[ 6], selector); + w[11] = __byte_perm (w[ 4], w[ 5], selector); + w[10] = __byte_perm (w[ 3], w[ 4], selector); + w[ 9] = __byte_perm (w[ 2], w[ 3], selector); + w[ 8] = __byte_perm (w[ 1], w[ 2], selector); + w[ 7] = __byte_perm (w[ 0], w[ 1], selector); + w[ 6] = __byte_perm ( 0, w[ 0], selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm (w[55], w[56], selector); + w[62] = __byte_perm (w[54], w[55], selector); + w[61] = __byte_perm (w[53], w[54], selector); + w[60] = __byte_perm (w[52], w[53], selector); + w[59] = __byte_perm (w[51], w[52], selector); + w[58] = __byte_perm (w[50], w[51], selector); + w[57] = __byte_perm (w[49], w[50], selector); + w[56] = __byte_perm (w[48], w[49], selector); + w[55] = __byte_perm (w[47], w[48], selector); + w[54] = __byte_perm (w[46], w[47], selector); + w[53] = __byte_perm (w[45], w[46], selector); + w[52] = __byte_perm (w[44], w[45], selector); + w[51] = __byte_perm (w[43], w[44], selector); + w[50] = __byte_perm (w[42], w[43], selector); + w[49] = __byte_perm (w[41], w[42], selector); + w[48] = __byte_perm (w[40], w[41], selector); + w[47] = __byte_perm (w[39], w[40], selector); + w[46] = __byte_perm (w[38], w[39], selector); + w[45] = __byte_perm (w[37], w[38], selector); + w[44] = __byte_perm (w[36], w[37], selector); + w[43] = __byte_perm (w[35], w[36], selector); + w[42] = __byte_perm (w[34], w[35], selector); + w[41] = __byte_perm (w[33], w[34], selector); + w[40] = __byte_perm (w[32], w[33], selector); + w[39] = __byte_perm (w[31], w[32], selector); + w[38] = __byte_perm (w[30], w[31], selector); + w[37] = __byte_perm (w[29], w[30], selector); + w[36] = __byte_perm (w[28], w[29], selector); + w[35] = __byte_perm (w[27], w[28], selector); + w[34] = __byte_perm (w[26], w[27], selector); + w[33] = __byte_perm (w[25], w[26], selector); + w[32] = __byte_perm (w[24], w[25], selector); + w[31] = __byte_perm (w[23], w[24], selector); + w[30] = __byte_perm (w[22], w[23], selector); + w[29] = __byte_perm (w[21], w[22], selector); + w[28] = __byte_perm (w[20], w[21], selector); + w[27] = __byte_perm (w[19], w[20], selector); + w[26] = __byte_perm (w[18], w[19], selector); + w[25] = __byte_perm (w[17], w[18], selector); + w[24] = __byte_perm (w[16], w[17], selector); + w[23] = __byte_perm (w[15], w[16], selector); + w[22] = __byte_perm (w[14], w[15], selector); + w[21] = __byte_perm (w[13], w[14], selector); + w[20] = __byte_perm (w[12], w[13], selector); + w[19] = __byte_perm (w[11], w[12], selector); + w[18] = __byte_perm (w[10], w[11], selector); + w[17] = __byte_perm (w[ 9], w[10], selector); + w[16] = __byte_perm (w[ 8], w[ 9], selector); + w[15] = __byte_perm (w[ 7], w[ 8], selector); + w[14] = __byte_perm (w[ 6], w[ 7], selector); + w[13] = __byte_perm (w[ 5], w[ 6], selector); + w[12] = __byte_perm (w[ 4], w[ 5], selector); + w[11] = __byte_perm (w[ 3], w[ 4], selector); + w[10] = __byte_perm (w[ 2], w[ 3], selector); + w[ 9] = __byte_perm (w[ 1], w[ 2], selector); + w[ 8] = __byte_perm (w[ 0], w[ 1], selector); + w[ 7] = __byte_perm ( 0, w[ 0], selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = __byte_perm (w[54], w[55], selector); + w[62] = __byte_perm (w[53], w[54], selector); + w[61] = __byte_perm (w[52], w[53], selector); + w[60] = __byte_perm (w[51], w[52], selector); + w[59] = __byte_perm (w[50], w[51], selector); + w[58] = __byte_perm (w[49], w[50], selector); + w[57] = __byte_perm (w[48], w[49], selector); + w[56] = __byte_perm (w[47], w[48], selector); + w[55] = __byte_perm (w[46], w[47], selector); + w[54] = __byte_perm (w[45], w[46], selector); + w[53] = __byte_perm (w[44], w[45], selector); + w[52] = __byte_perm (w[43], w[44], selector); + w[51] = __byte_perm (w[42], w[43], selector); + w[50] = __byte_perm (w[41], w[42], selector); + w[49] = __byte_perm (w[40], w[41], selector); + w[48] = __byte_perm (w[39], w[40], selector); + w[47] = __byte_perm (w[38], w[39], selector); + w[46] = __byte_perm (w[37], w[38], selector); + w[45] = __byte_perm (w[36], w[37], selector); + w[44] = __byte_perm (w[35], w[36], selector); + w[43] = __byte_perm (w[34], w[35], selector); + w[42] = __byte_perm (w[33], w[34], selector); + w[41] = __byte_perm (w[32], w[33], selector); + w[40] = __byte_perm (w[31], w[32], selector); + w[39] = __byte_perm (w[30], w[31], selector); + w[38] = __byte_perm (w[29], w[30], selector); + w[37] = __byte_perm (w[28], w[29], selector); + w[36] = __byte_perm (w[27], w[28], selector); + w[35] = __byte_perm (w[26], w[27], selector); + w[34] = __byte_perm (w[25], w[26], selector); + w[33] = __byte_perm (w[24], w[25], selector); + w[32] = __byte_perm (w[23], w[24], selector); + w[31] = __byte_perm (w[22], w[23], selector); + w[30] = __byte_perm (w[21], w[22], selector); + w[29] = __byte_perm (w[20], w[21], selector); + w[28] = __byte_perm (w[19], w[20], selector); + w[27] = __byte_perm (w[18], w[19], selector); + w[26] = __byte_perm (w[17], w[18], selector); + w[25] = __byte_perm (w[16], w[17], selector); + w[24] = __byte_perm (w[15], w[16], selector); + w[23] = __byte_perm (w[14], w[15], selector); + w[22] = __byte_perm (w[13], w[14], selector); + w[21] = __byte_perm (w[12], w[13], selector); + w[20] = __byte_perm (w[11], w[12], selector); + w[19] = __byte_perm (w[10], w[11], selector); + w[18] = __byte_perm (w[ 9], w[10], selector); + w[17] = __byte_perm (w[ 8], w[ 9], selector); + w[16] = __byte_perm (w[ 7], w[ 8], selector); + w[15] = __byte_perm (w[ 6], w[ 7], selector); + w[14] = __byte_perm (w[ 5], w[ 6], selector); + w[13] = __byte_perm (w[ 4], w[ 5], selector); + w[12] = __byte_perm (w[ 3], w[ 4], selector); + w[11] = __byte_perm (w[ 2], w[ 3], selector); + w[10] = __byte_perm (w[ 1], w[ 2], selector); + w[ 9] = __byte_perm (w[ 0], w[ 1], selector); + w[ 8] = __byte_perm ( 0, w[ 0], selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm (w[53], w[54], selector); + w[62] = __byte_perm (w[52], w[53], selector); + w[61] = __byte_perm (w[51], w[52], selector); + w[60] = __byte_perm (w[50], w[51], selector); + w[59] = __byte_perm (w[49], w[50], selector); + w[58] = __byte_perm (w[48], w[49], selector); + w[57] = __byte_perm (w[47], w[48], selector); + w[56] = __byte_perm (w[46], w[47], selector); + w[55] = __byte_perm (w[45], w[46], selector); + w[54] = __byte_perm (w[44], w[45], selector); + w[53] = __byte_perm (w[43], w[44], selector); + w[52] = __byte_perm (w[42], w[43], selector); + w[51] = __byte_perm (w[41], w[42], selector); + w[50] = __byte_perm (w[40], w[41], selector); + w[49] = __byte_perm (w[39], w[40], selector); + w[48] = __byte_perm (w[38], w[39], selector); + w[47] = __byte_perm (w[37], w[38], selector); + w[46] = __byte_perm (w[36], w[37], selector); + w[45] = __byte_perm (w[35], w[36], selector); + w[44] = __byte_perm (w[34], w[35], selector); + w[43] = __byte_perm (w[33], w[34], selector); + w[42] = __byte_perm (w[32], w[33], selector); + w[41] = __byte_perm (w[31], w[32], selector); + w[40] = __byte_perm (w[30], w[31], selector); + w[39] = __byte_perm (w[29], w[30], selector); + w[38] = __byte_perm (w[28], w[29], selector); + w[37] = __byte_perm (w[27], w[28], selector); + w[36] = __byte_perm (w[26], w[27], selector); + w[35] = __byte_perm (w[25], w[26], selector); + w[34] = __byte_perm (w[24], w[25], selector); + w[33] = __byte_perm (w[23], w[24], selector); + w[32] = __byte_perm (w[22], w[23], selector); + w[31] = __byte_perm (w[21], w[22], selector); + w[30] = __byte_perm (w[20], w[21], selector); + w[29] = __byte_perm (w[19], w[20], selector); + w[28] = __byte_perm (w[18], w[19], selector); + w[27] = __byte_perm (w[17], w[18], selector); + w[26] = __byte_perm (w[16], w[17], selector); + w[25] = __byte_perm (w[15], w[16], selector); + w[24] = __byte_perm (w[14], w[15], selector); + w[23] = __byte_perm (w[13], w[14], selector); + w[22] = __byte_perm (w[12], w[13], selector); + w[21] = __byte_perm (w[11], w[12], selector); + w[20] = __byte_perm (w[10], w[11], selector); + w[19] = __byte_perm (w[ 9], w[10], selector); + w[18] = __byte_perm (w[ 8], w[ 9], selector); + w[17] = __byte_perm (w[ 7], w[ 8], selector); + w[16] = __byte_perm (w[ 6], w[ 7], selector); + w[15] = __byte_perm (w[ 5], w[ 6], selector); + w[14] = __byte_perm (w[ 4], w[ 5], selector); + w[13] = __byte_perm (w[ 3], w[ 4], selector); + w[12] = __byte_perm (w[ 2], w[ 3], selector); + w[11] = __byte_perm (w[ 1], w[ 2], selector); + w[10] = __byte_perm (w[ 0], w[ 1], selector); + w[ 9] = __byte_perm ( 0, w[ 0], selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm (w[52], w[53], selector); + w[62] = __byte_perm (w[51], w[52], selector); + w[61] = __byte_perm (w[50], w[51], selector); + w[60] = __byte_perm (w[49], w[50], selector); + w[59] = __byte_perm (w[48], w[49], selector); + w[58] = __byte_perm (w[47], w[48], selector); + w[57] = __byte_perm (w[46], w[47], selector); + w[56] = __byte_perm (w[45], w[46], selector); + w[55] = __byte_perm (w[44], w[45], selector); + w[54] = __byte_perm (w[43], w[44], selector); + w[53] = __byte_perm (w[42], w[43], selector); + w[52] = __byte_perm (w[41], w[42], selector); + w[51] = __byte_perm (w[40], w[41], selector); + w[50] = __byte_perm (w[39], w[40], selector); + w[49] = __byte_perm (w[38], w[39], selector); + w[48] = __byte_perm (w[37], w[38], selector); + w[47] = __byte_perm (w[36], w[37], selector); + w[46] = __byte_perm (w[35], w[36], selector); + w[45] = __byte_perm (w[34], w[35], selector); + w[44] = __byte_perm (w[33], w[34], selector); + w[43] = __byte_perm (w[32], w[33], selector); + w[42] = __byte_perm (w[31], w[32], selector); + w[41] = __byte_perm (w[30], w[31], selector); + w[40] = __byte_perm (w[29], w[30], selector); + w[39] = __byte_perm (w[28], w[29], selector); + w[38] = __byte_perm (w[27], w[28], selector); + w[37] = __byte_perm (w[26], w[27], selector); + w[36] = __byte_perm (w[25], w[26], selector); + w[35] = __byte_perm (w[24], w[25], selector); + w[34] = __byte_perm (w[23], w[24], selector); + w[33] = __byte_perm (w[22], w[23], selector); + w[32] = __byte_perm (w[21], w[22], selector); + w[31] = __byte_perm (w[20], w[21], selector); + w[30] = __byte_perm (w[19], w[20], selector); + w[29] = __byte_perm (w[18], w[19], selector); + w[28] = __byte_perm (w[17], w[18], selector); + w[27] = __byte_perm (w[16], w[17], selector); + w[26] = __byte_perm (w[15], w[16], selector); + w[25] = __byte_perm (w[14], w[15], selector); + w[24] = __byte_perm (w[13], w[14], selector); + w[23] = __byte_perm (w[12], w[13], selector); + w[22] = __byte_perm (w[11], w[12], selector); + w[21] = __byte_perm (w[10], w[11], selector); + w[20] = __byte_perm (w[ 9], w[10], selector); + w[19] = __byte_perm (w[ 8], w[ 9], selector); + w[18] = __byte_perm (w[ 7], w[ 8], selector); + w[17] = __byte_perm (w[ 6], w[ 7], selector); + w[16] = __byte_perm (w[ 5], w[ 6], selector); + w[15] = __byte_perm (w[ 4], w[ 5], selector); + w[14] = __byte_perm (w[ 3], w[ 4], selector); + w[13] = __byte_perm (w[ 2], w[ 3], selector); + w[12] = __byte_perm (w[ 1], w[ 2], selector); + w[11] = __byte_perm (w[ 0], w[ 1], selector); + w[10] = __byte_perm ( 0, w[ 0], selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm (w[51], w[52], selector); + w[62] = __byte_perm (w[50], w[51], selector); + w[61] = __byte_perm (w[49], w[50], selector); + w[60] = __byte_perm (w[48], w[49], selector); + w[59] = __byte_perm (w[47], w[48], selector); + w[58] = __byte_perm (w[46], w[47], selector); + w[57] = __byte_perm (w[45], w[46], selector); + w[56] = __byte_perm (w[44], w[45], selector); + w[55] = __byte_perm (w[43], w[44], selector); + w[54] = __byte_perm (w[42], w[43], selector); + w[53] = __byte_perm (w[41], w[42], selector); + w[52] = __byte_perm (w[40], w[41], selector); + w[51] = __byte_perm (w[39], w[40], selector); + w[50] = __byte_perm (w[38], w[39], selector); + w[49] = __byte_perm (w[37], w[38], selector); + w[48] = __byte_perm (w[36], w[37], selector); + w[47] = __byte_perm (w[35], w[36], selector); + w[46] = __byte_perm (w[34], w[35], selector); + w[45] = __byte_perm (w[33], w[34], selector); + w[44] = __byte_perm (w[32], w[33], selector); + w[43] = __byte_perm (w[31], w[32], selector); + w[42] = __byte_perm (w[30], w[31], selector); + w[41] = __byte_perm (w[29], w[30], selector); + w[40] = __byte_perm (w[28], w[29], selector); + w[39] = __byte_perm (w[27], w[28], selector); + w[38] = __byte_perm (w[26], w[27], selector); + w[37] = __byte_perm (w[25], w[26], selector); + w[36] = __byte_perm (w[24], w[25], selector); + w[35] = __byte_perm (w[23], w[24], selector); + w[34] = __byte_perm (w[22], w[23], selector); + w[33] = __byte_perm (w[21], w[22], selector); + w[32] = __byte_perm (w[20], w[21], selector); + w[31] = __byte_perm (w[19], w[20], selector); + w[30] = __byte_perm (w[18], w[19], selector); + w[29] = __byte_perm (w[17], w[18], selector); + w[28] = __byte_perm (w[16], w[17], selector); + w[27] = __byte_perm (w[15], w[16], selector); + w[26] = __byte_perm (w[14], w[15], selector); + w[25] = __byte_perm (w[13], w[14], selector); + w[24] = __byte_perm (w[12], w[13], selector); + w[23] = __byte_perm (w[11], w[12], selector); + w[22] = __byte_perm (w[10], w[11], selector); + w[21] = __byte_perm (w[ 9], w[10], selector); + w[20] = __byte_perm (w[ 8], w[ 9], selector); + w[19] = __byte_perm (w[ 7], w[ 8], selector); + w[18] = __byte_perm (w[ 6], w[ 7], selector); + w[17] = __byte_perm (w[ 5], w[ 6], selector); + w[16] = __byte_perm (w[ 4], w[ 5], selector); + w[15] = __byte_perm (w[ 3], w[ 4], selector); + w[14] = __byte_perm (w[ 2], w[ 3], selector); + w[13] = __byte_perm (w[ 1], w[ 2], selector); + w[12] = __byte_perm (w[ 0], w[ 1], selector); + w[11] = __byte_perm ( 0, w[ 0], selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm (w[50], w[51], selector); + w[62] = __byte_perm (w[49], w[50], selector); + w[61] = __byte_perm (w[48], w[49], selector); + w[60] = __byte_perm (w[47], w[48], selector); + w[59] = __byte_perm (w[46], w[47], selector); + w[58] = __byte_perm (w[45], w[46], selector); + w[57] = __byte_perm (w[44], w[45], selector); + w[56] = __byte_perm (w[43], w[44], selector); + w[55] = __byte_perm (w[42], w[43], selector); + w[54] = __byte_perm (w[41], w[42], selector); + w[53] = __byte_perm (w[40], w[41], selector); + w[52] = __byte_perm (w[39], w[40], selector); + w[51] = __byte_perm (w[38], w[39], selector); + w[50] = __byte_perm (w[37], w[38], selector); + w[49] = __byte_perm (w[36], w[37], selector); + w[48] = __byte_perm (w[35], w[36], selector); + w[47] = __byte_perm (w[34], w[35], selector); + w[46] = __byte_perm (w[33], w[34], selector); + w[45] = __byte_perm (w[32], w[33], selector); + w[44] = __byte_perm (w[31], w[32], selector); + w[43] = __byte_perm (w[30], w[31], selector); + w[42] = __byte_perm (w[29], w[30], selector); + w[41] = __byte_perm (w[28], w[29], selector); + w[40] = __byte_perm (w[27], w[28], selector); + w[39] = __byte_perm (w[26], w[27], selector); + w[38] = __byte_perm (w[25], w[26], selector); + w[37] = __byte_perm (w[24], w[25], selector); + w[36] = __byte_perm (w[23], w[24], selector); + w[35] = __byte_perm (w[22], w[23], selector); + w[34] = __byte_perm (w[21], w[22], selector); + w[33] = __byte_perm (w[20], w[21], selector); + w[32] = __byte_perm (w[19], w[20], selector); + w[31] = __byte_perm (w[18], w[19], selector); + w[30] = __byte_perm (w[17], w[18], selector); + w[29] = __byte_perm (w[16], w[17], selector); + w[28] = __byte_perm (w[15], w[16], selector); + w[27] = __byte_perm (w[14], w[15], selector); + w[26] = __byte_perm (w[13], w[14], selector); + w[25] = __byte_perm (w[12], w[13], selector); + w[24] = __byte_perm (w[11], w[12], selector); + w[23] = __byte_perm (w[10], w[11], selector); + w[22] = __byte_perm (w[ 9], w[10], selector); + w[21] = __byte_perm (w[ 8], w[ 9], selector); + w[20] = __byte_perm (w[ 7], w[ 8], selector); + w[19] = __byte_perm (w[ 6], w[ 7], selector); + w[18] = __byte_perm (w[ 5], w[ 6], selector); + w[17] = __byte_perm (w[ 4], w[ 5], selector); + w[16] = __byte_perm (w[ 3], w[ 4], selector); + w[15] = __byte_perm (w[ 2], w[ 3], selector); + w[14] = __byte_perm (w[ 1], w[ 2], selector); + w[13] = __byte_perm (w[ 0], w[ 1], selector); + w[12] = __byte_perm ( 0, w[ 0], selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = __byte_perm (w[49], w[50], selector); + w[62] = __byte_perm (w[48], w[49], selector); + w[61] = __byte_perm (w[47], w[48], selector); + w[60] = __byte_perm (w[46], w[47], selector); + w[59] = __byte_perm (w[45], w[46], selector); + w[58] = __byte_perm (w[44], w[45], selector); + w[57] = __byte_perm (w[43], w[44], selector); + w[56] = __byte_perm (w[42], w[43], selector); + w[55] = __byte_perm (w[41], w[42], selector); + w[54] = __byte_perm (w[40], w[41], selector); + w[53] = __byte_perm (w[39], w[40], selector); + w[52] = __byte_perm (w[38], w[39], selector); + w[51] = __byte_perm (w[37], w[38], selector); + w[50] = __byte_perm (w[36], w[37], selector); + w[49] = __byte_perm (w[35], w[36], selector); + w[48] = __byte_perm (w[34], w[35], selector); + w[47] = __byte_perm (w[33], w[34], selector); + w[46] = __byte_perm (w[32], w[33], selector); + w[45] = __byte_perm (w[31], w[32], selector); + w[44] = __byte_perm (w[30], w[31], selector); + w[43] = __byte_perm (w[29], w[30], selector); + w[42] = __byte_perm (w[28], w[29], selector); + w[41] = __byte_perm (w[27], w[28], selector); + w[40] = __byte_perm (w[26], w[27], selector); + w[39] = __byte_perm (w[25], w[26], selector); + w[38] = __byte_perm (w[24], w[25], selector); + w[37] = __byte_perm (w[23], w[24], selector); + w[36] = __byte_perm (w[22], w[23], selector); + w[35] = __byte_perm (w[21], w[22], selector); + w[34] = __byte_perm (w[20], w[21], selector); + w[33] = __byte_perm (w[19], w[20], selector); + w[32] = __byte_perm (w[18], w[19], selector); + w[31] = __byte_perm (w[17], w[18], selector); + w[30] = __byte_perm (w[16], w[17], selector); + w[29] = __byte_perm (w[15], w[16], selector); + w[28] = __byte_perm (w[14], w[15], selector); + w[27] = __byte_perm (w[13], w[14], selector); + w[26] = __byte_perm (w[12], w[13], selector); + w[25] = __byte_perm (w[11], w[12], selector); + w[24] = __byte_perm (w[10], w[11], selector); + w[23] = __byte_perm (w[ 9], w[10], selector); + w[22] = __byte_perm (w[ 8], w[ 9], selector); + w[21] = __byte_perm (w[ 7], w[ 8], selector); + w[20] = __byte_perm (w[ 6], w[ 7], selector); + w[19] = __byte_perm (w[ 5], w[ 6], selector); + w[18] = __byte_perm (w[ 4], w[ 5], selector); + w[17] = __byte_perm (w[ 3], w[ 4], selector); + w[16] = __byte_perm (w[ 2], w[ 3], selector); + w[15] = __byte_perm (w[ 1], w[ 2], selector); + w[14] = __byte_perm (w[ 0], w[ 1], selector); + w[13] = __byte_perm ( 0, w[ 0], selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm (w[48], w[49], selector); + w[62] = __byte_perm (w[47], w[48], selector); + w[61] = __byte_perm (w[46], w[47], selector); + w[60] = __byte_perm (w[45], w[46], selector); + w[59] = __byte_perm (w[44], w[45], selector); + w[58] = __byte_perm (w[43], w[44], selector); + w[57] = __byte_perm (w[42], w[43], selector); + w[56] = __byte_perm (w[41], w[42], selector); + w[55] = __byte_perm (w[40], w[41], selector); + w[54] = __byte_perm (w[39], w[40], selector); + w[53] = __byte_perm (w[38], w[39], selector); + w[52] = __byte_perm (w[37], w[38], selector); + w[51] = __byte_perm (w[36], w[37], selector); + w[50] = __byte_perm (w[35], w[36], selector); + w[49] = __byte_perm (w[34], w[35], selector); + w[48] = __byte_perm (w[33], w[34], selector); + w[47] = __byte_perm (w[32], w[33], selector); + w[46] = __byte_perm (w[31], w[32], selector); + w[45] = __byte_perm (w[30], w[31], selector); + w[44] = __byte_perm (w[29], w[30], selector); + w[43] = __byte_perm (w[28], w[29], selector); + w[42] = __byte_perm (w[27], w[28], selector); + w[41] = __byte_perm (w[26], w[27], selector); + w[40] = __byte_perm (w[25], w[26], selector); + w[39] = __byte_perm (w[24], w[25], selector); + w[38] = __byte_perm (w[23], w[24], selector); + w[37] = __byte_perm (w[22], w[23], selector); + w[36] = __byte_perm (w[21], w[22], selector); + w[35] = __byte_perm (w[20], w[21], selector); + w[34] = __byte_perm (w[19], w[20], selector); + w[33] = __byte_perm (w[18], w[19], selector); + w[32] = __byte_perm (w[17], w[18], selector); + w[31] = __byte_perm (w[16], w[17], selector); + w[30] = __byte_perm (w[15], w[16], selector); + w[29] = __byte_perm (w[14], w[15], selector); + w[28] = __byte_perm (w[13], w[14], selector); + w[27] = __byte_perm (w[12], w[13], selector); + w[26] = __byte_perm (w[11], w[12], selector); + w[25] = __byte_perm (w[10], w[11], selector); + w[24] = __byte_perm (w[ 9], w[10], selector); + w[23] = __byte_perm (w[ 8], w[ 9], selector); + w[22] = __byte_perm (w[ 7], w[ 8], selector); + w[21] = __byte_perm (w[ 6], w[ 7], selector); + w[20] = __byte_perm (w[ 5], w[ 6], selector); + w[19] = __byte_perm (w[ 4], w[ 5], selector); + w[18] = __byte_perm (w[ 3], w[ 4], selector); + w[17] = __byte_perm (w[ 2], w[ 3], selector); + w[16] = __byte_perm (w[ 1], w[ 2], selector); + w[15] = __byte_perm (w[ 0], w[ 1], selector); + w[14] = __byte_perm ( 0, w[ 0], selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm (w[47], w[48], selector); + w[62] = __byte_perm (w[46], w[47], selector); + w[61] = __byte_perm (w[45], w[46], selector); + w[60] = __byte_perm (w[44], w[45], selector); + w[59] = __byte_perm (w[43], w[44], selector); + w[58] = __byte_perm (w[42], w[43], selector); + w[57] = __byte_perm (w[41], w[42], selector); + w[56] = __byte_perm (w[40], w[41], selector); + w[55] = __byte_perm (w[39], w[40], selector); + w[54] = __byte_perm (w[38], w[39], selector); + w[53] = __byte_perm (w[37], w[38], selector); + w[52] = __byte_perm (w[36], w[37], selector); + w[51] = __byte_perm (w[35], w[36], selector); + w[50] = __byte_perm (w[34], w[35], selector); + w[49] = __byte_perm (w[33], w[34], selector); + w[48] = __byte_perm (w[32], w[33], selector); + w[47] = __byte_perm (w[31], w[32], selector); + w[46] = __byte_perm (w[30], w[31], selector); + w[45] = __byte_perm (w[29], w[30], selector); + w[44] = __byte_perm (w[28], w[29], selector); + w[43] = __byte_perm (w[27], w[28], selector); + w[42] = __byte_perm (w[26], w[27], selector); + w[41] = __byte_perm (w[25], w[26], selector); + w[40] = __byte_perm (w[24], w[25], selector); + w[39] = __byte_perm (w[23], w[24], selector); + w[38] = __byte_perm (w[22], w[23], selector); + w[37] = __byte_perm (w[21], w[22], selector); + w[36] = __byte_perm (w[20], w[21], selector); + w[35] = __byte_perm (w[19], w[20], selector); + w[34] = __byte_perm (w[18], w[19], selector); + w[33] = __byte_perm (w[17], w[18], selector); + w[32] = __byte_perm (w[16], w[17], selector); + w[31] = __byte_perm (w[15], w[16], selector); + w[30] = __byte_perm (w[14], w[15], selector); + w[29] = __byte_perm (w[13], w[14], selector); + w[28] = __byte_perm (w[12], w[13], selector); + w[27] = __byte_perm (w[11], w[12], selector); + w[26] = __byte_perm (w[10], w[11], selector); + w[25] = __byte_perm (w[ 9], w[10], selector); + w[24] = __byte_perm (w[ 8], w[ 9], selector); + w[23] = __byte_perm (w[ 7], w[ 8], selector); + w[22] = __byte_perm (w[ 6], w[ 7], selector); + w[21] = __byte_perm (w[ 5], w[ 6], selector); + w[20] = __byte_perm (w[ 4], w[ 5], selector); + w[19] = __byte_perm (w[ 3], w[ 4], selector); + w[18] = __byte_perm (w[ 2], w[ 3], selector); + w[17] = __byte_perm (w[ 1], w[ 2], selector); + w[16] = __byte_perm (w[ 0], w[ 1], selector); + w[15] = __byte_perm ( 0, w[ 0], selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm (w[46], w[47], selector); + w[62] = __byte_perm (w[45], w[46], selector); + w[61] = __byte_perm (w[44], w[45], selector); + w[60] = __byte_perm (w[43], w[44], selector); + w[59] = __byte_perm (w[42], w[43], selector); + w[58] = __byte_perm (w[41], w[42], selector); + w[57] = __byte_perm (w[40], w[41], selector); + w[56] = __byte_perm (w[39], w[40], selector); + w[55] = __byte_perm (w[38], w[39], selector); + w[54] = __byte_perm (w[37], w[38], selector); + w[53] = __byte_perm (w[36], w[37], selector); + w[52] = __byte_perm (w[35], w[36], selector); + w[51] = __byte_perm (w[34], w[35], selector); + w[50] = __byte_perm (w[33], w[34], selector); + w[49] = __byte_perm (w[32], w[33], selector); + w[48] = __byte_perm (w[31], w[32], selector); + w[47] = __byte_perm (w[30], w[31], selector); + w[46] = __byte_perm (w[29], w[30], selector); + w[45] = __byte_perm (w[28], w[29], selector); + w[44] = __byte_perm (w[27], w[28], selector); + w[43] = __byte_perm (w[26], w[27], selector); + w[42] = __byte_perm (w[25], w[26], selector); + w[41] = __byte_perm (w[24], w[25], selector); + w[40] = __byte_perm (w[23], w[24], selector); + w[39] = __byte_perm (w[22], w[23], selector); + w[38] = __byte_perm (w[21], w[22], selector); + w[37] = __byte_perm (w[20], w[21], selector); + w[36] = __byte_perm (w[19], w[20], selector); + w[35] = __byte_perm (w[18], w[19], selector); + w[34] = __byte_perm (w[17], w[18], selector); + w[33] = __byte_perm (w[16], w[17], selector); + w[32] = __byte_perm (w[15], w[16], selector); + w[31] = __byte_perm (w[14], w[15], selector); + w[30] = __byte_perm (w[13], w[14], selector); + w[29] = __byte_perm (w[12], w[13], selector); + w[28] = __byte_perm (w[11], w[12], selector); + w[27] = __byte_perm (w[10], w[11], selector); + w[26] = __byte_perm (w[ 9], w[10], selector); + w[25] = __byte_perm (w[ 8], w[ 9], selector); + w[24] = __byte_perm (w[ 7], w[ 8], selector); + w[23] = __byte_perm (w[ 6], w[ 7], selector); + w[22] = __byte_perm (w[ 5], w[ 6], selector); + w[21] = __byte_perm (w[ 4], w[ 5], selector); + w[20] = __byte_perm (w[ 3], w[ 4], selector); + w[19] = __byte_perm (w[ 2], w[ 3], selector); + w[18] = __byte_perm (w[ 1], w[ 2], selector); + w[17] = __byte_perm (w[ 0], w[ 1], selector); + w[16] = __byte_perm ( 0, w[ 0], selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm (w[45], w[46], selector); + w[62] = __byte_perm (w[44], w[45], selector); + w[61] = __byte_perm (w[43], w[44], selector); + w[60] = __byte_perm (w[42], w[43], selector); + w[59] = __byte_perm (w[41], w[42], selector); + w[58] = __byte_perm (w[40], w[41], selector); + w[57] = __byte_perm (w[39], w[40], selector); + w[56] = __byte_perm (w[38], w[39], selector); + w[55] = __byte_perm (w[37], w[38], selector); + w[54] = __byte_perm (w[36], w[37], selector); + w[53] = __byte_perm (w[35], w[36], selector); + w[52] = __byte_perm (w[34], w[35], selector); + w[51] = __byte_perm (w[33], w[34], selector); + w[50] = __byte_perm (w[32], w[33], selector); + w[49] = __byte_perm (w[31], w[32], selector); + w[48] = __byte_perm (w[30], w[31], selector); + w[47] = __byte_perm (w[29], w[30], selector); + w[46] = __byte_perm (w[28], w[29], selector); + w[45] = __byte_perm (w[27], w[28], selector); + w[44] = __byte_perm (w[26], w[27], selector); + w[43] = __byte_perm (w[25], w[26], selector); + w[42] = __byte_perm (w[24], w[25], selector); + w[41] = __byte_perm (w[23], w[24], selector); + w[40] = __byte_perm (w[22], w[23], selector); + w[39] = __byte_perm (w[21], w[22], selector); + w[38] = __byte_perm (w[20], w[21], selector); + w[37] = __byte_perm (w[19], w[20], selector); + w[36] = __byte_perm (w[18], w[19], selector); + w[35] = __byte_perm (w[17], w[18], selector); + w[34] = __byte_perm (w[16], w[17], selector); + w[33] = __byte_perm (w[15], w[16], selector); + w[32] = __byte_perm (w[14], w[15], selector); + w[31] = __byte_perm (w[13], w[14], selector); + w[30] = __byte_perm (w[12], w[13], selector); + w[29] = __byte_perm (w[11], w[12], selector); + w[28] = __byte_perm (w[10], w[11], selector); + w[27] = __byte_perm (w[ 9], w[10], selector); + w[26] = __byte_perm (w[ 8], w[ 9], selector); + w[25] = __byte_perm (w[ 7], w[ 8], selector); + w[24] = __byte_perm (w[ 6], w[ 7], selector); + w[23] = __byte_perm (w[ 5], w[ 6], selector); + w[22] = __byte_perm (w[ 4], w[ 5], selector); + w[21] = __byte_perm (w[ 3], w[ 4], selector); + w[20] = __byte_perm (w[ 2], w[ 3], selector); + w[19] = __byte_perm (w[ 1], w[ 2], selector); + w[18] = __byte_perm (w[ 0], w[ 1], selector); + w[17] = __byte_perm ( 0, w[ 0], selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm (w[44], w[45], selector); + w[62] = __byte_perm (w[43], w[44], selector); + w[61] = __byte_perm (w[42], w[43], selector); + w[60] = __byte_perm (w[41], w[42], selector); + w[59] = __byte_perm (w[40], w[41], selector); + w[58] = __byte_perm (w[39], w[40], selector); + w[57] = __byte_perm (w[38], w[39], selector); + w[56] = __byte_perm (w[37], w[38], selector); + w[55] = __byte_perm (w[36], w[37], selector); + w[54] = __byte_perm (w[35], w[36], selector); + w[53] = __byte_perm (w[34], w[35], selector); + w[52] = __byte_perm (w[33], w[34], selector); + w[51] = __byte_perm (w[32], w[33], selector); + w[50] = __byte_perm (w[31], w[32], selector); + w[49] = __byte_perm (w[30], w[31], selector); + w[48] = __byte_perm (w[29], w[30], selector); + w[47] = __byte_perm (w[28], w[29], selector); + w[46] = __byte_perm (w[27], w[28], selector); + w[45] = __byte_perm (w[26], w[27], selector); + w[44] = __byte_perm (w[25], w[26], selector); + w[43] = __byte_perm (w[24], w[25], selector); + w[42] = __byte_perm (w[23], w[24], selector); + w[41] = __byte_perm (w[22], w[23], selector); + w[40] = __byte_perm (w[21], w[22], selector); + w[39] = __byte_perm (w[20], w[21], selector); + w[38] = __byte_perm (w[19], w[20], selector); + w[37] = __byte_perm (w[18], w[19], selector); + w[36] = __byte_perm (w[17], w[18], selector); + w[35] = __byte_perm (w[16], w[17], selector); + w[34] = __byte_perm (w[15], w[16], selector); + w[33] = __byte_perm (w[14], w[15], selector); + w[32] = __byte_perm (w[13], w[14], selector); + w[31] = __byte_perm (w[12], w[13], selector); + w[30] = __byte_perm (w[11], w[12], selector); + w[29] = __byte_perm (w[10], w[11], selector); + w[28] = __byte_perm (w[ 9], w[10], selector); + w[27] = __byte_perm (w[ 8], w[ 9], selector); + w[26] = __byte_perm (w[ 7], w[ 8], selector); + w[25] = __byte_perm (w[ 6], w[ 7], selector); + w[24] = __byte_perm (w[ 5], w[ 6], selector); + w[23] = __byte_perm (w[ 4], w[ 5], selector); + w[22] = __byte_perm (w[ 3], w[ 4], selector); + w[21] = __byte_perm (w[ 2], w[ 3], selector); + w[20] = __byte_perm (w[ 1], w[ 2], selector); + w[19] = __byte_perm (w[ 0], w[ 1], selector); + w[18] = __byte_perm ( 0, w[ 0], selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm (w[43], w[44], selector); + w[62] = __byte_perm (w[42], w[43], selector); + w[61] = __byte_perm (w[41], w[42], selector); + w[60] = __byte_perm (w[40], w[41], selector); + w[59] = __byte_perm (w[39], w[40], selector); + w[58] = __byte_perm (w[38], w[39], selector); + w[57] = __byte_perm (w[37], w[38], selector); + w[56] = __byte_perm (w[36], w[37], selector); + w[55] = __byte_perm (w[35], w[36], selector); + w[54] = __byte_perm (w[34], w[35], selector); + w[53] = __byte_perm (w[33], w[34], selector); + w[52] = __byte_perm (w[32], w[33], selector); + w[51] = __byte_perm (w[31], w[32], selector); + w[50] = __byte_perm (w[30], w[31], selector); + w[49] = __byte_perm (w[29], w[30], selector); + w[48] = __byte_perm (w[28], w[29], selector); + w[47] = __byte_perm (w[27], w[28], selector); + w[46] = __byte_perm (w[26], w[27], selector); + w[45] = __byte_perm (w[25], w[26], selector); + w[44] = __byte_perm (w[24], w[25], selector); + w[43] = __byte_perm (w[23], w[24], selector); + w[42] = __byte_perm (w[22], w[23], selector); + w[41] = __byte_perm (w[21], w[22], selector); + w[40] = __byte_perm (w[20], w[21], selector); + w[39] = __byte_perm (w[19], w[20], selector); + w[38] = __byte_perm (w[18], w[19], selector); + w[37] = __byte_perm (w[17], w[18], selector); + w[36] = __byte_perm (w[16], w[17], selector); + w[35] = __byte_perm (w[15], w[16], selector); + w[34] = __byte_perm (w[14], w[15], selector); + w[33] = __byte_perm (w[13], w[14], selector); + w[32] = __byte_perm (w[12], w[13], selector); + w[31] = __byte_perm (w[11], w[12], selector); + w[30] = __byte_perm (w[10], w[11], selector); + w[29] = __byte_perm (w[ 9], w[10], selector); + w[28] = __byte_perm (w[ 8], w[ 9], selector); + w[27] = __byte_perm (w[ 7], w[ 8], selector); + w[26] = __byte_perm (w[ 6], w[ 7], selector); + w[25] = __byte_perm (w[ 5], w[ 6], selector); + w[24] = __byte_perm (w[ 4], w[ 5], selector); + w[23] = __byte_perm (w[ 3], w[ 4], selector); + w[22] = __byte_perm (w[ 2], w[ 3], selector); + w[21] = __byte_perm (w[ 1], w[ 2], selector); + w[20] = __byte_perm (w[ 0], w[ 1], selector); + w[19] = __byte_perm ( 0, w[ 0], selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm (w[42], w[43], selector); + w[62] = __byte_perm (w[41], w[42], selector); + w[61] = __byte_perm (w[40], w[41], selector); + w[60] = __byte_perm (w[39], w[40], selector); + w[59] = __byte_perm (w[38], w[39], selector); + w[58] = __byte_perm (w[37], w[38], selector); + w[57] = __byte_perm (w[36], w[37], selector); + w[56] = __byte_perm (w[35], w[36], selector); + w[55] = __byte_perm (w[34], w[35], selector); + w[54] = __byte_perm (w[33], w[34], selector); + w[53] = __byte_perm (w[32], w[33], selector); + w[52] = __byte_perm (w[31], w[32], selector); + w[51] = __byte_perm (w[30], w[31], selector); + w[50] = __byte_perm (w[29], w[30], selector); + w[49] = __byte_perm (w[28], w[29], selector); + w[48] = __byte_perm (w[27], w[28], selector); + w[47] = __byte_perm (w[26], w[27], selector); + w[46] = __byte_perm (w[25], w[26], selector); + w[45] = __byte_perm (w[24], w[25], selector); + w[44] = __byte_perm (w[23], w[24], selector); + w[43] = __byte_perm (w[22], w[23], selector); + w[42] = __byte_perm (w[21], w[22], selector); + w[41] = __byte_perm (w[20], w[21], selector); + w[40] = __byte_perm (w[19], w[20], selector); + w[39] = __byte_perm (w[18], w[19], selector); + w[38] = __byte_perm (w[17], w[18], selector); + w[37] = __byte_perm (w[16], w[17], selector); + w[36] = __byte_perm (w[15], w[16], selector); + w[35] = __byte_perm (w[14], w[15], selector); + w[34] = __byte_perm (w[13], w[14], selector); + w[33] = __byte_perm (w[12], w[13], selector); + w[32] = __byte_perm (w[11], w[12], selector); + w[31] = __byte_perm (w[10], w[11], selector); + w[30] = __byte_perm (w[ 9], w[10], selector); + w[29] = __byte_perm (w[ 8], w[ 9], selector); + w[28] = __byte_perm (w[ 7], w[ 8], selector); + w[27] = __byte_perm (w[ 6], w[ 7], selector); + w[26] = __byte_perm (w[ 5], w[ 6], selector); + w[25] = __byte_perm (w[ 4], w[ 5], selector); + w[24] = __byte_perm (w[ 3], w[ 4], selector); + w[23] = __byte_perm (w[ 2], w[ 3], selector); + w[22] = __byte_perm (w[ 1], w[ 2], selector); + w[21] = __byte_perm (w[ 0], w[ 1], selector); + w[20] = __byte_perm ( 0, w[ 0], selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm (w[41], w[42], selector); + w[62] = __byte_perm (w[40], w[41], selector); + w[61] = __byte_perm (w[39], w[40], selector); + w[60] = __byte_perm (w[38], w[39], selector); + w[59] = __byte_perm (w[37], w[38], selector); + w[58] = __byte_perm (w[36], w[37], selector); + w[57] = __byte_perm (w[35], w[36], selector); + w[56] = __byte_perm (w[34], w[35], selector); + w[55] = __byte_perm (w[33], w[34], selector); + w[54] = __byte_perm (w[32], w[33], selector); + w[53] = __byte_perm (w[31], w[32], selector); + w[52] = __byte_perm (w[30], w[31], selector); + w[51] = __byte_perm (w[29], w[30], selector); + w[50] = __byte_perm (w[28], w[29], selector); + w[49] = __byte_perm (w[27], w[28], selector); + w[48] = __byte_perm (w[26], w[27], selector); + w[47] = __byte_perm (w[25], w[26], selector); + w[46] = __byte_perm (w[24], w[25], selector); + w[45] = __byte_perm (w[23], w[24], selector); + w[44] = __byte_perm (w[22], w[23], selector); + w[43] = __byte_perm (w[21], w[22], selector); + w[42] = __byte_perm (w[20], w[21], selector); + w[41] = __byte_perm (w[19], w[20], selector); + w[40] = __byte_perm (w[18], w[19], selector); + w[39] = __byte_perm (w[17], w[18], selector); + w[38] = __byte_perm (w[16], w[17], selector); + w[37] = __byte_perm (w[15], w[16], selector); + w[36] = __byte_perm (w[14], w[15], selector); + w[35] = __byte_perm (w[13], w[14], selector); + w[34] = __byte_perm (w[12], w[13], selector); + w[33] = __byte_perm (w[11], w[12], selector); + w[32] = __byte_perm (w[10], w[11], selector); + w[31] = __byte_perm (w[ 9], w[10], selector); + w[30] = __byte_perm (w[ 8], w[ 9], selector); + w[29] = __byte_perm (w[ 7], w[ 8], selector); + w[28] = __byte_perm (w[ 6], w[ 7], selector); + w[27] = __byte_perm (w[ 5], w[ 6], selector); + w[26] = __byte_perm (w[ 4], w[ 5], selector); + w[25] = __byte_perm (w[ 3], w[ 4], selector); + w[24] = __byte_perm (w[ 2], w[ 3], selector); + w[23] = __byte_perm (w[ 1], w[ 2], selector); + w[22] = __byte_perm (w[ 0], w[ 1], selector); + w[21] = __byte_perm ( 0, w[ 0], selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm (w[40], w[41], selector); + w[62] = __byte_perm (w[39], w[40], selector); + w[61] = __byte_perm (w[38], w[39], selector); + w[60] = __byte_perm (w[37], w[38], selector); + w[59] = __byte_perm (w[36], w[37], selector); + w[58] = __byte_perm (w[35], w[36], selector); + w[57] = __byte_perm (w[34], w[35], selector); + w[56] = __byte_perm (w[33], w[34], selector); + w[55] = __byte_perm (w[32], w[33], selector); + w[54] = __byte_perm (w[31], w[32], selector); + w[53] = __byte_perm (w[30], w[31], selector); + w[52] = __byte_perm (w[29], w[30], selector); + w[51] = __byte_perm (w[28], w[29], selector); + w[50] = __byte_perm (w[27], w[28], selector); + w[49] = __byte_perm (w[26], w[27], selector); + w[48] = __byte_perm (w[25], w[26], selector); + w[47] = __byte_perm (w[24], w[25], selector); + w[46] = __byte_perm (w[23], w[24], selector); + w[45] = __byte_perm (w[22], w[23], selector); + w[44] = __byte_perm (w[21], w[22], selector); + w[43] = __byte_perm (w[20], w[21], selector); + w[42] = __byte_perm (w[19], w[20], selector); + w[41] = __byte_perm (w[18], w[19], selector); + w[40] = __byte_perm (w[17], w[18], selector); + w[39] = __byte_perm (w[16], w[17], selector); + w[38] = __byte_perm (w[15], w[16], selector); + w[37] = __byte_perm (w[14], w[15], selector); + w[36] = __byte_perm (w[13], w[14], selector); + w[35] = __byte_perm (w[12], w[13], selector); + w[34] = __byte_perm (w[11], w[12], selector); + w[33] = __byte_perm (w[10], w[11], selector); + w[32] = __byte_perm (w[ 9], w[10], selector); + w[31] = __byte_perm (w[ 8], w[ 9], selector); + w[30] = __byte_perm (w[ 7], w[ 8], selector); + w[29] = __byte_perm (w[ 6], w[ 7], selector); + w[28] = __byte_perm (w[ 5], w[ 6], selector); + w[27] = __byte_perm (w[ 4], w[ 5], selector); + w[26] = __byte_perm (w[ 3], w[ 4], selector); + w[25] = __byte_perm (w[ 2], w[ 3], selector); + w[24] = __byte_perm (w[ 1], w[ 2], selector); + w[23] = __byte_perm (w[ 0], w[ 1], selector); + w[22] = __byte_perm ( 0, w[ 0], selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = __byte_perm (w[39], w[40], selector); + w[62] = __byte_perm (w[38], w[39], selector); + w[61] = __byte_perm (w[37], w[38], selector); + w[60] = __byte_perm (w[36], w[37], selector); + w[59] = __byte_perm (w[35], w[36], selector); + w[58] = __byte_perm (w[34], w[35], selector); + w[57] = __byte_perm (w[33], w[34], selector); + w[56] = __byte_perm (w[32], w[33], selector); + w[55] = __byte_perm (w[31], w[32], selector); + w[54] = __byte_perm (w[30], w[31], selector); + w[53] = __byte_perm (w[29], w[30], selector); + w[52] = __byte_perm (w[28], w[29], selector); + w[51] = __byte_perm (w[27], w[28], selector); + w[50] = __byte_perm (w[26], w[27], selector); + w[49] = __byte_perm (w[25], w[26], selector); + w[48] = __byte_perm (w[24], w[25], selector); + w[47] = __byte_perm (w[23], w[24], selector); + w[46] = __byte_perm (w[22], w[23], selector); + w[45] = __byte_perm (w[21], w[22], selector); + w[44] = __byte_perm (w[20], w[21], selector); + w[43] = __byte_perm (w[19], w[20], selector); + w[42] = __byte_perm (w[18], w[19], selector); + w[41] = __byte_perm (w[17], w[18], selector); + w[40] = __byte_perm (w[16], w[17], selector); + w[39] = __byte_perm (w[15], w[16], selector); + w[38] = __byte_perm (w[14], w[15], selector); + w[37] = __byte_perm (w[13], w[14], selector); + w[36] = __byte_perm (w[12], w[13], selector); + w[35] = __byte_perm (w[11], w[12], selector); + w[34] = __byte_perm (w[10], w[11], selector); + w[33] = __byte_perm (w[ 9], w[10], selector); + w[32] = __byte_perm (w[ 8], w[ 9], selector); + w[31] = __byte_perm (w[ 7], w[ 8], selector); + w[30] = __byte_perm (w[ 6], w[ 7], selector); + w[29] = __byte_perm (w[ 5], w[ 6], selector); + w[28] = __byte_perm (w[ 4], w[ 5], selector); + w[27] = __byte_perm (w[ 3], w[ 4], selector); + w[26] = __byte_perm (w[ 2], w[ 3], selector); + w[25] = __byte_perm (w[ 1], w[ 2], selector); + w[24] = __byte_perm (w[ 0], w[ 1], selector); + w[23] = __byte_perm ( 0, w[ 0], selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm (w[38], w[39], selector); + w[62] = __byte_perm (w[37], w[38], selector); + w[61] = __byte_perm (w[36], w[37], selector); + w[60] = __byte_perm (w[35], w[36], selector); + w[59] = __byte_perm (w[34], w[35], selector); + w[58] = __byte_perm (w[33], w[34], selector); + w[57] = __byte_perm (w[32], w[33], selector); + w[56] = __byte_perm (w[31], w[32], selector); + w[55] = __byte_perm (w[30], w[31], selector); + w[54] = __byte_perm (w[29], w[30], selector); + w[53] = __byte_perm (w[28], w[29], selector); + w[52] = __byte_perm (w[27], w[28], selector); + w[51] = __byte_perm (w[26], w[27], selector); + w[50] = __byte_perm (w[25], w[26], selector); + w[49] = __byte_perm (w[24], w[25], selector); + w[48] = __byte_perm (w[23], w[24], selector); + w[47] = __byte_perm (w[22], w[23], selector); + w[46] = __byte_perm (w[21], w[22], selector); + w[45] = __byte_perm (w[20], w[21], selector); + w[44] = __byte_perm (w[19], w[20], selector); + w[43] = __byte_perm (w[18], w[19], selector); + w[42] = __byte_perm (w[17], w[18], selector); + w[41] = __byte_perm (w[16], w[17], selector); + w[40] = __byte_perm (w[15], w[16], selector); + w[39] = __byte_perm (w[14], w[15], selector); + w[38] = __byte_perm (w[13], w[14], selector); + w[37] = __byte_perm (w[12], w[13], selector); + w[36] = __byte_perm (w[11], w[12], selector); + w[35] = __byte_perm (w[10], w[11], selector); + w[34] = __byte_perm (w[ 9], w[10], selector); + w[33] = __byte_perm (w[ 8], w[ 9], selector); + w[32] = __byte_perm (w[ 7], w[ 8], selector); + w[31] = __byte_perm (w[ 6], w[ 7], selector); + w[30] = __byte_perm (w[ 5], w[ 6], selector); + w[29] = __byte_perm (w[ 4], w[ 5], selector); + w[28] = __byte_perm (w[ 3], w[ 4], selector); + w[27] = __byte_perm (w[ 2], w[ 3], selector); + w[26] = __byte_perm (w[ 1], w[ 2], selector); + w[25] = __byte_perm (w[ 0], w[ 1], selector); + w[24] = __byte_perm ( 0, w[ 0], selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm (w[37], w[38], selector); + w[62] = __byte_perm (w[36], w[37], selector); + w[61] = __byte_perm (w[35], w[36], selector); + w[60] = __byte_perm (w[34], w[35], selector); + w[59] = __byte_perm (w[33], w[34], selector); + w[58] = __byte_perm (w[32], w[33], selector); + w[57] = __byte_perm (w[31], w[32], selector); + w[56] = __byte_perm (w[30], w[31], selector); + w[55] = __byte_perm (w[29], w[30], selector); + w[54] = __byte_perm (w[28], w[29], selector); + w[53] = __byte_perm (w[27], w[28], selector); + w[52] = __byte_perm (w[26], w[27], selector); + w[51] = __byte_perm (w[25], w[26], selector); + w[50] = __byte_perm (w[24], w[25], selector); + w[49] = __byte_perm (w[23], w[24], selector); + w[48] = __byte_perm (w[22], w[23], selector); + w[47] = __byte_perm (w[21], w[22], selector); + w[46] = __byte_perm (w[20], w[21], selector); + w[45] = __byte_perm (w[19], w[20], selector); + w[44] = __byte_perm (w[18], w[19], selector); + w[43] = __byte_perm (w[17], w[18], selector); + w[42] = __byte_perm (w[16], w[17], selector); + w[41] = __byte_perm (w[15], w[16], selector); + w[40] = __byte_perm (w[14], w[15], selector); + w[39] = __byte_perm (w[13], w[14], selector); + w[38] = __byte_perm (w[12], w[13], selector); + w[37] = __byte_perm (w[11], w[12], selector); + w[36] = __byte_perm (w[10], w[11], selector); + w[35] = __byte_perm (w[ 9], w[10], selector); + w[34] = __byte_perm (w[ 8], w[ 9], selector); + w[33] = __byte_perm (w[ 7], w[ 8], selector); + w[32] = __byte_perm (w[ 6], w[ 7], selector); + w[31] = __byte_perm (w[ 5], w[ 6], selector); + w[30] = __byte_perm (w[ 4], w[ 5], selector); + w[29] = __byte_perm (w[ 3], w[ 4], selector); + w[28] = __byte_perm (w[ 2], w[ 3], selector); + w[27] = __byte_perm (w[ 1], w[ 2], selector); + w[26] = __byte_perm (w[ 0], w[ 1], selector); + w[25] = __byte_perm ( 0, w[ 0], selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm (w[36], w[37], selector); + w[62] = __byte_perm (w[35], w[36], selector); + w[61] = __byte_perm (w[34], w[35], selector); + w[60] = __byte_perm (w[33], w[34], selector); + w[59] = __byte_perm (w[32], w[33], selector); + w[58] = __byte_perm (w[31], w[32], selector); + w[57] = __byte_perm (w[30], w[31], selector); + w[56] = __byte_perm (w[29], w[30], selector); + w[55] = __byte_perm (w[28], w[29], selector); + w[54] = __byte_perm (w[27], w[28], selector); + w[53] = __byte_perm (w[26], w[27], selector); + w[52] = __byte_perm (w[25], w[26], selector); + w[51] = __byte_perm (w[24], w[25], selector); + w[50] = __byte_perm (w[23], w[24], selector); + w[49] = __byte_perm (w[22], w[23], selector); + w[48] = __byte_perm (w[21], w[22], selector); + w[47] = __byte_perm (w[20], w[21], selector); + w[46] = __byte_perm (w[19], w[20], selector); + w[45] = __byte_perm (w[18], w[19], selector); + w[44] = __byte_perm (w[17], w[18], selector); + w[43] = __byte_perm (w[16], w[17], selector); + w[42] = __byte_perm (w[15], w[16], selector); + w[41] = __byte_perm (w[14], w[15], selector); + w[40] = __byte_perm (w[13], w[14], selector); + w[39] = __byte_perm (w[12], w[13], selector); + w[38] = __byte_perm (w[11], w[12], selector); + w[37] = __byte_perm (w[10], w[11], selector); + w[36] = __byte_perm (w[ 9], w[10], selector); + w[35] = __byte_perm (w[ 8], w[ 9], selector); + w[34] = __byte_perm (w[ 7], w[ 8], selector); + w[33] = __byte_perm (w[ 6], w[ 7], selector); + w[32] = __byte_perm (w[ 5], w[ 6], selector); + w[31] = __byte_perm (w[ 4], w[ 5], selector); + w[30] = __byte_perm (w[ 3], w[ 4], selector); + w[29] = __byte_perm (w[ 2], w[ 3], selector); + w[28] = __byte_perm (w[ 1], w[ 2], selector); + w[27] = __byte_perm (w[ 0], w[ 1], selector); + w[26] = __byte_perm ( 0, w[ 0], selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm (w[35], w[36], selector); + w[62] = __byte_perm (w[34], w[35], selector); + w[61] = __byte_perm (w[33], w[34], selector); + w[60] = __byte_perm (w[32], w[33], selector); + w[59] = __byte_perm (w[31], w[32], selector); + w[58] = __byte_perm (w[30], w[31], selector); + w[57] = __byte_perm (w[29], w[30], selector); + w[56] = __byte_perm (w[28], w[29], selector); + w[55] = __byte_perm (w[27], w[28], selector); + w[54] = __byte_perm (w[26], w[27], selector); + w[53] = __byte_perm (w[25], w[26], selector); + w[52] = __byte_perm (w[24], w[25], selector); + w[51] = __byte_perm (w[23], w[24], selector); + w[50] = __byte_perm (w[22], w[23], selector); + w[49] = __byte_perm (w[21], w[22], selector); + w[48] = __byte_perm (w[20], w[21], selector); + w[47] = __byte_perm (w[19], w[20], selector); + w[46] = __byte_perm (w[18], w[19], selector); + w[45] = __byte_perm (w[17], w[18], selector); + w[44] = __byte_perm (w[16], w[17], selector); + w[43] = __byte_perm (w[15], w[16], selector); + w[42] = __byte_perm (w[14], w[15], selector); + w[41] = __byte_perm (w[13], w[14], selector); + w[40] = __byte_perm (w[12], w[13], selector); + w[39] = __byte_perm (w[11], w[12], selector); + w[38] = __byte_perm (w[10], w[11], selector); + w[37] = __byte_perm (w[ 9], w[10], selector); + w[36] = __byte_perm (w[ 8], w[ 9], selector); + w[35] = __byte_perm (w[ 7], w[ 8], selector); + w[34] = __byte_perm (w[ 6], w[ 7], selector); + w[33] = __byte_perm (w[ 5], w[ 6], selector); + w[32] = __byte_perm (w[ 4], w[ 5], selector); + w[31] = __byte_perm (w[ 3], w[ 4], selector); + w[30] = __byte_perm (w[ 2], w[ 3], selector); + w[29] = __byte_perm (w[ 1], w[ 2], selector); + w[28] = __byte_perm (w[ 0], w[ 1], selector); + w[27] = __byte_perm ( 0, w[ 0], selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm (w[34], w[35], selector); + w[62] = __byte_perm (w[33], w[34], selector); + w[61] = __byte_perm (w[32], w[33], selector); + w[60] = __byte_perm (w[31], w[32], selector); + w[59] = __byte_perm (w[30], w[31], selector); + w[58] = __byte_perm (w[29], w[30], selector); + w[57] = __byte_perm (w[28], w[29], selector); + w[56] = __byte_perm (w[27], w[28], selector); + w[55] = __byte_perm (w[26], w[27], selector); + w[54] = __byte_perm (w[25], w[26], selector); + w[53] = __byte_perm (w[24], w[25], selector); + w[52] = __byte_perm (w[23], w[24], selector); + w[51] = __byte_perm (w[22], w[23], selector); + w[50] = __byte_perm (w[21], w[22], selector); + w[49] = __byte_perm (w[20], w[21], selector); + w[48] = __byte_perm (w[19], w[20], selector); + w[47] = __byte_perm (w[18], w[19], selector); + w[46] = __byte_perm (w[17], w[18], selector); + w[45] = __byte_perm (w[16], w[17], selector); + w[44] = __byte_perm (w[15], w[16], selector); + w[43] = __byte_perm (w[14], w[15], selector); + w[42] = __byte_perm (w[13], w[14], selector); + w[41] = __byte_perm (w[12], w[13], selector); + w[40] = __byte_perm (w[11], w[12], selector); + w[39] = __byte_perm (w[10], w[11], selector); + w[38] = __byte_perm (w[ 9], w[10], selector); + w[37] = __byte_perm (w[ 8], w[ 9], selector); + w[36] = __byte_perm (w[ 7], w[ 8], selector); + w[35] = __byte_perm (w[ 6], w[ 7], selector); + w[34] = __byte_perm (w[ 5], w[ 6], selector); + w[33] = __byte_perm (w[ 4], w[ 5], selector); + w[32] = __byte_perm (w[ 3], w[ 4], selector); + w[31] = __byte_perm (w[ 2], w[ 3], selector); + w[30] = __byte_perm (w[ 1], w[ 2], selector); + w[29] = __byte_perm (w[ 0], w[ 1], selector); + w[28] = __byte_perm ( 0, w[ 0], selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm (w[33], w[34], selector); + w[62] = __byte_perm (w[32], w[33], selector); + w[61] = __byte_perm (w[31], w[32], selector); + w[60] = __byte_perm (w[30], w[31], selector); + w[59] = __byte_perm (w[29], w[30], selector); + w[58] = __byte_perm (w[28], w[29], selector); + w[57] = __byte_perm (w[27], w[28], selector); + w[56] = __byte_perm (w[26], w[27], selector); + w[55] = __byte_perm (w[25], w[26], selector); + w[54] = __byte_perm (w[24], w[25], selector); + w[53] = __byte_perm (w[23], w[24], selector); + w[52] = __byte_perm (w[22], w[23], selector); + w[51] = __byte_perm (w[21], w[22], selector); + w[50] = __byte_perm (w[20], w[21], selector); + w[49] = __byte_perm (w[19], w[20], selector); + w[48] = __byte_perm (w[18], w[19], selector); + w[47] = __byte_perm (w[17], w[18], selector); + w[46] = __byte_perm (w[16], w[17], selector); + w[45] = __byte_perm (w[15], w[16], selector); + w[44] = __byte_perm (w[14], w[15], selector); + w[43] = __byte_perm (w[13], w[14], selector); + w[42] = __byte_perm (w[12], w[13], selector); + w[41] = __byte_perm (w[11], w[12], selector); + w[40] = __byte_perm (w[10], w[11], selector); + w[39] = __byte_perm (w[ 9], w[10], selector); + w[38] = __byte_perm (w[ 8], w[ 9], selector); + w[37] = __byte_perm (w[ 7], w[ 8], selector); + w[36] = __byte_perm (w[ 6], w[ 7], selector); + w[35] = __byte_perm (w[ 5], w[ 6], selector); + w[34] = __byte_perm (w[ 4], w[ 5], selector); + w[33] = __byte_perm (w[ 3], w[ 4], selector); + w[32] = __byte_perm (w[ 2], w[ 3], selector); + w[31] = __byte_perm (w[ 1], w[ 2], selector); + w[30] = __byte_perm (w[ 0], w[ 1], selector); + w[29] = __byte_perm ( 0, w[ 0], selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm (w[32], w[33], selector); + w[62] = __byte_perm (w[31], w[32], selector); + w[61] = __byte_perm (w[30], w[31], selector); + w[60] = __byte_perm (w[29], w[30], selector); + w[59] = __byte_perm (w[28], w[29], selector); + w[58] = __byte_perm (w[27], w[28], selector); + w[57] = __byte_perm (w[26], w[27], selector); + w[56] = __byte_perm (w[25], w[26], selector); + w[55] = __byte_perm (w[24], w[25], selector); + w[54] = __byte_perm (w[23], w[24], selector); + w[53] = __byte_perm (w[22], w[23], selector); + w[52] = __byte_perm (w[21], w[22], selector); + w[51] = __byte_perm (w[20], w[21], selector); + w[50] = __byte_perm (w[19], w[20], selector); + w[49] = __byte_perm (w[18], w[19], selector); + w[48] = __byte_perm (w[17], w[18], selector); + w[47] = __byte_perm (w[16], w[17], selector); + w[46] = __byte_perm (w[15], w[16], selector); + w[45] = __byte_perm (w[14], w[15], selector); + w[44] = __byte_perm (w[13], w[14], selector); + w[43] = __byte_perm (w[12], w[13], selector); + w[42] = __byte_perm (w[11], w[12], selector); + w[41] = __byte_perm (w[10], w[11], selector); + w[40] = __byte_perm (w[ 9], w[10], selector); + w[39] = __byte_perm (w[ 8], w[ 9], selector); + w[38] = __byte_perm (w[ 7], w[ 8], selector); + w[37] = __byte_perm (w[ 6], w[ 7], selector); + w[36] = __byte_perm (w[ 5], w[ 6], selector); + w[35] = __byte_perm (w[ 4], w[ 5], selector); + w[34] = __byte_perm (w[ 3], w[ 4], selector); + w[33] = __byte_perm (w[ 2], w[ 3], selector); + w[32] = __byte_perm (w[ 1], w[ 2], selector); + w[31] = __byte_perm (w[ 0], w[ 1], selector); + w[30] = __byte_perm ( 0, w[ 0], selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm (w[31], w[32], selector); + w[62] = __byte_perm (w[30], w[31], selector); + w[61] = __byte_perm (w[29], w[30], selector); + w[60] = __byte_perm (w[28], w[29], selector); + w[59] = __byte_perm (w[27], w[28], selector); + w[58] = __byte_perm (w[26], w[27], selector); + w[57] = __byte_perm (w[25], w[26], selector); + w[56] = __byte_perm (w[24], w[25], selector); + w[55] = __byte_perm (w[23], w[24], selector); + w[54] = __byte_perm (w[22], w[23], selector); + w[53] = __byte_perm (w[21], w[22], selector); + w[52] = __byte_perm (w[20], w[21], selector); + w[51] = __byte_perm (w[19], w[20], selector); + w[50] = __byte_perm (w[18], w[19], selector); + w[49] = __byte_perm (w[17], w[18], selector); + w[48] = __byte_perm (w[16], w[17], selector); + w[47] = __byte_perm (w[15], w[16], selector); + w[46] = __byte_perm (w[14], w[15], selector); + w[45] = __byte_perm (w[13], w[14], selector); + w[44] = __byte_perm (w[12], w[13], selector); + w[43] = __byte_perm (w[11], w[12], selector); + w[42] = __byte_perm (w[10], w[11], selector); + w[41] = __byte_perm (w[ 9], w[10], selector); + w[40] = __byte_perm (w[ 8], w[ 9], selector); + w[39] = __byte_perm (w[ 7], w[ 8], selector); + w[38] = __byte_perm (w[ 6], w[ 7], selector); + w[37] = __byte_perm (w[ 5], w[ 6], selector); + w[36] = __byte_perm (w[ 4], w[ 5], selector); + w[35] = __byte_perm (w[ 3], w[ 4], selector); + w[34] = __byte_perm (w[ 2], w[ 3], selector); + w[33] = __byte_perm (w[ 1], w[ 2], selector); + w[32] = __byte_perm (w[ 0], w[ 1], selector); + w[31] = __byte_perm ( 0, w[ 0], selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm (w[30], w[31], selector); + w[62] = __byte_perm (w[29], w[30], selector); + w[61] = __byte_perm (w[28], w[29], selector); + w[60] = __byte_perm (w[27], w[28], selector); + w[59] = __byte_perm (w[26], w[27], selector); + w[58] = __byte_perm (w[25], w[26], selector); + w[57] = __byte_perm (w[24], w[25], selector); + w[56] = __byte_perm (w[23], w[24], selector); + w[55] = __byte_perm (w[22], w[23], selector); + w[54] = __byte_perm (w[21], w[22], selector); + w[53] = __byte_perm (w[20], w[21], selector); + w[52] = __byte_perm (w[19], w[20], selector); + w[51] = __byte_perm (w[18], w[19], selector); + w[50] = __byte_perm (w[17], w[18], selector); + w[49] = __byte_perm (w[16], w[17], selector); + w[48] = __byte_perm (w[15], w[16], selector); + w[47] = __byte_perm (w[14], w[15], selector); + w[46] = __byte_perm (w[13], w[14], selector); + w[45] = __byte_perm (w[12], w[13], selector); + w[44] = __byte_perm (w[11], w[12], selector); + w[43] = __byte_perm (w[10], w[11], selector); + w[42] = __byte_perm (w[ 9], w[10], selector); + w[41] = __byte_perm (w[ 8], w[ 9], selector); + w[40] = __byte_perm (w[ 7], w[ 8], selector); + w[39] = __byte_perm (w[ 6], w[ 7], selector); + w[38] = __byte_perm (w[ 5], w[ 6], selector); + w[37] = __byte_perm (w[ 4], w[ 5], selector); + w[36] = __byte_perm (w[ 3], w[ 4], selector); + w[35] = __byte_perm (w[ 2], w[ 3], selector); + w[34] = __byte_perm (w[ 1], w[ 2], selector); + w[33] = __byte_perm (w[ 0], w[ 1], selector); + w[32] = __byte_perm ( 0, w[ 0], selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm (w[29], w[30], selector); + w[62] = __byte_perm (w[28], w[29], selector); + w[61] = __byte_perm (w[27], w[28], selector); + w[60] = __byte_perm (w[26], w[27], selector); + w[59] = __byte_perm (w[25], w[26], selector); + w[58] = __byte_perm (w[24], w[25], selector); + w[57] = __byte_perm (w[23], w[24], selector); + w[56] = __byte_perm (w[22], w[23], selector); + w[55] = __byte_perm (w[21], w[22], selector); + w[54] = __byte_perm (w[20], w[21], selector); + w[53] = __byte_perm (w[19], w[20], selector); + w[52] = __byte_perm (w[18], w[19], selector); + w[51] = __byte_perm (w[17], w[18], selector); + w[50] = __byte_perm (w[16], w[17], selector); + w[49] = __byte_perm (w[15], w[16], selector); + w[48] = __byte_perm (w[14], w[15], selector); + w[47] = __byte_perm (w[13], w[14], selector); + w[46] = __byte_perm (w[12], w[13], selector); + w[45] = __byte_perm (w[11], w[12], selector); + w[44] = __byte_perm (w[10], w[11], selector); + w[43] = __byte_perm (w[ 9], w[10], selector); + w[42] = __byte_perm (w[ 8], w[ 9], selector); + w[41] = __byte_perm (w[ 7], w[ 8], selector); + w[40] = __byte_perm (w[ 6], w[ 7], selector); + w[39] = __byte_perm (w[ 5], w[ 6], selector); + w[38] = __byte_perm (w[ 4], w[ 5], selector); + w[37] = __byte_perm (w[ 3], w[ 4], selector); + w[36] = __byte_perm (w[ 2], w[ 3], selector); + w[35] = __byte_perm (w[ 1], w[ 2], selector); + w[34] = __byte_perm (w[ 0], w[ 1], selector); + w[33] = __byte_perm ( 0, w[ 0], selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm (w[28], w[29], selector); + w[62] = __byte_perm (w[27], w[28], selector); + w[61] = __byte_perm (w[26], w[27], selector); + w[60] = __byte_perm (w[25], w[26], selector); + w[59] = __byte_perm (w[24], w[25], selector); + w[58] = __byte_perm (w[23], w[24], selector); + w[57] = __byte_perm (w[22], w[23], selector); + w[56] = __byte_perm (w[21], w[22], selector); + w[55] = __byte_perm (w[20], w[21], selector); + w[54] = __byte_perm (w[19], w[20], selector); + w[53] = __byte_perm (w[18], w[19], selector); + w[52] = __byte_perm (w[17], w[18], selector); + w[51] = __byte_perm (w[16], w[17], selector); + w[50] = __byte_perm (w[15], w[16], selector); + w[49] = __byte_perm (w[14], w[15], selector); + w[48] = __byte_perm (w[13], w[14], selector); + w[47] = __byte_perm (w[12], w[13], selector); + w[46] = __byte_perm (w[11], w[12], selector); + w[45] = __byte_perm (w[10], w[11], selector); + w[44] = __byte_perm (w[ 9], w[10], selector); + w[43] = __byte_perm (w[ 8], w[ 9], selector); + w[42] = __byte_perm (w[ 7], w[ 8], selector); + w[41] = __byte_perm (w[ 6], w[ 7], selector); + w[40] = __byte_perm (w[ 5], w[ 6], selector); + w[39] = __byte_perm (w[ 4], w[ 5], selector); + w[38] = __byte_perm (w[ 3], w[ 4], selector); + w[37] = __byte_perm (w[ 2], w[ 3], selector); + w[36] = __byte_perm (w[ 1], w[ 2], selector); + w[35] = __byte_perm (w[ 0], w[ 1], selector); + w[34] = __byte_perm ( 0, w[ 0], selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm (w[27], w[28], selector); + w[62] = __byte_perm (w[26], w[27], selector); + w[61] = __byte_perm (w[25], w[26], selector); + w[60] = __byte_perm (w[24], w[25], selector); + w[59] = __byte_perm (w[23], w[24], selector); + w[58] = __byte_perm (w[22], w[23], selector); + w[57] = __byte_perm (w[21], w[22], selector); + w[56] = __byte_perm (w[20], w[21], selector); + w[55] = __byte_perm (w[19], w[20], selector); + w[54] = __byte_perm (w[18], w[19], selector); + w[53] = __byte_perm (w[17], w[18], selector); + w[52] = __byte_perm (w[16], w[17], selector); + w[51] = __byte_perm (w[15], w[16], selector); + w[50] = __byte_perm (w[14], w[15], selector); + w[49] = __byte_perm (w[13], w[14], selector); + w[48] = __byte_perm (w[12], w[13], selector); + w[47] = __byte_perm (w[11], w[12], selector); + w[46] = __byte_perm (w[10], w[11], selector); + w[45] = __byte_perm (w[ 9], w[10], selector); + w[44] = __byte_perm (w[ 8], w[ 9], selector); + w[43] = __byte_perm (w[ 7], w[ 8], selector); + w[42] = __byte_perm (w[ 6], w[ 7], selector); + w[41] = __byte_perm (w[ 5], w[ 6], selector); + w[40] = __byte_perm (w[ 4], w[ 5], selector); + w[39] = __byte_perm (w[ 3], w[ 4], selector); + w[38] = __byte_perm (w[ 2], w[ 3], selector); + w[37] = __byte_perm (w[ 1], w[ 2], selector); + w[36] = __byte_perm (w[ 0], w[ 1], selector); + w[35] = __byte_perm ( 0, w[ 0], selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm (w[26], w[27], selector); + w[62] = __byte_perm (w[25], w[26], selector); + w[61] = __byte_perm (w[24], w[25], selector); + w[60] = __byte_perm (w[23], w[24], selector); + w[59] = __byte_perm (w[22], w[23], selector); + w[58] = __byte_perm (w[21], w[22], selector); + w[57] = __byte_perm (w[20], w[21], selector); + w[56] = __byte_perm (w[19], w[20], selector); + w[55] = __byte_perm (w[18], w[19], selector); + w[54] = __byte_perm (w[17], w[18], selector); + w[53] = __byte_perm (w[16], w[17], selector); + w[52] = __byte_perm (w[15], w[16], selector); + w[51] = __byte_perm (w[14], w[15], selector); + w[50] = __byte_perm (w[13], w[14], selector); + w[49] = __byte_perm (w[12], w[13], selector); + w[48] = __byte_perm (w[11], w[12], selector); + w[47] = __byte_perm (w[10], w[11], selector); + w[46] = __byte_perm (w[ 9], w[10], selector); + w[45] = __byte_perm (w[ 8], w[ 9], selector); + w[44] = __byte_perm (w[ 7], w[ 8], selector); + w[43] = __byte_perm (w[ 6], w[ 7], selector); + w[42] = __byte_perm (w[ 5], w[ 6], selector); + w[41] = __byte_perm (w[ 4], w[ 5], selector); + w[40] = __byte_perm (w[ 3], w[ 4], selector); + w[39] = __byte_perm (w[ 2], w[ 3], selector); + w[38] = __byte_perm (w[ 1], w[ 2], selector); + w[37] = __byte_perm (w[ 0], w[ 1], selector); + w[36] = __byte_perm ( 0, w[ 0], selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm (w[25], w[26], selector); + w[62] = __byte_perm (w[24], w[25], selector); + w[61] = __byte_perm (w[23], w[24], selector); + w[60] = __byte_perm (w[22], w[23], selector); + w[59] = __byte_perm (w[21], w[22], selector); + w[58] = __byte_perm (w[20], w[21], selector); + w[57] = __byte_perm (w[19], w[20], selector); + w[56] = __byte_perm (w[18], w[19], selector); + w[55] = __byte_perm (w[17], w[18], selector); + w[54] = __byte_perm (w[16], w[17], selector); + w[53] = __byte_perm (w[15], w[16], selector); + w[52] = __byte_perm (w[14], w[15], selector); + w[51] = __byte_perm (w[13], w[14], selector); + w[50] = __byte_perm (w[12], w[13], selector); + w[49] = __byte_perm (w[11], w[12], selector); + w[48] = __byte_perm (w[10], w[11], selector); + w[47] = __byte_perm (w[ 9], w[10], selector); + w[46] = __byte_perm (w[ 8], w[ 9], selector); + w[45] = __byte_perm (w[ 7], w[ 8], selector); + w[44] = __byte_perm (w[ 6], w[ 7], selector); + w[43] = __byte_perm (w[ 5], w[ 6], selector); + w[42] = __byte_perm (w[ 4], w[ 5], selector); + w[41] = __byte_perm (w[ 3], w[ 4], selector); + w[40] = __byte_perm (w[ 2], w[ 3], selector); + w[39] = __byte_perm (w[ 1], w[ 2], selector); + w[38] = __byte_perm (w[ 0], w[ 1], selector); + w[37] = __byte_perm ( 0, w[ 0], selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm (w[24], w[25], selector); + w[62] = __byte_perm (w[23], w[24], selector); + w[61] = __byte_perm (w[22], w[23], selector); + w[60] = __byte_perm (w[21], w[22], selector); + w[59] = __byte_perm (w[20], w[21], selector); + w[58] = __byte_perm (w[19], w[20], selector); + w[57] = __byte_perm (w[18], w[19], selector); + w[56] = __byte_perm (w[17], w[18], selector); + w[55] = __byte_perm (w[16], w[17], selector); + w[54] = __byte_perm (w[15], w[16], selector); + w[53] = __byte_perm (w[14], w[15], selector); + w[52] = __byte_perm (w[13], w[14], selector); + w[51] = __byte_perm (w[12], w[13], selector); + w[50] = __byte_perm (w[11], w[12], selector); + w[49] = __byte_perm (w[10], w[11], selector); + w[48] = __byte_perm (w[ 9], w[10], selector); + w[47] = __byte_perm (w[ 8], w[ 9], selector); + w[46] = __byte_perm (w[ 7], w[ 8], selector); + w[45] = __byte_perm (w[ 6], w[ 7], selector); + w[44] = __byte_perm (w[ 5], w[ 6], selector); + w[43] = __byte_perm (w[ 4], w[ 5], selector); + w[42] = __byte_perm (w[ 3], w[ 4], selector); + w[41] = __byte_perm (w[ 2], w[ 3], selector); + w[40] = __byte_perm (w[ 1], w[ 2], selector); + w[39] = __byte_perm (w[ 0], w[ 1], selector); + w[38] = __byte_perm ( 0, w[ 0], selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm (w[23], w[24], selector); + w[62] = __byte_perm (w[22], w[23], selector); + w[61] = __byte_perm (w[21], w[22], selector); + w[60] = __byte_perm (w[20], w[21], selector); + w[59] = __byte_perm (w[19], w[20], selector); + w[58] = __byte_perm (w[18], w[19], selector); + w[57] = __byte_perm (w[17], w[18], selector); + w[56] = __byte_perm (w[16], w[17], selector); + w[55] = __byte_perm (w[15], w[16], selector); + w[54] = __byte_perm (w[14], w[15], selector); + w[53] = __byte_perm (w[13], w[14], selector); + w[52] = __byte_perm (w[12], w[13], selector); + w[51] = __byte_perm (w[11], w[12], selector); + w[50] = __byte_perm (w[10], w[11], selector); + w[49] = __byte_perm (w[ 9], w[10], selector); + w[48] = __byte_perm (w[ 8], w[ 9], selector); + w[47] = __byte_perm (w[ 7], w[ 8], selector); + w[46] = __byte_perm (w[ 6], w[ 7], selector); + w[45] = __byte_perm (w[ 5], w[ 6], selector); + w[44] = __byte_perm (w[ 4], w[ 5], selector); + w[43] = __byte_perm (w[ 3], w[ 4], selector); + w[42] = __byte_perm (w[ 2], w[ 3], selector); + w[41] = __byte_perm (w[ 1], w[ 2], selector); + w[40] = __byte_perm (w[ 0], w[ 1], selector); + w[39] = __byte_perm ( 0, w[ 0], selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm (w[22], w[23], selector); + w[62] = __byte_perm (w[21], w[22], selector); + w[61] = __byte_perm (w[20], w[21], selector); + w[60] = __byte_perm (w[19], w[20], selector); + w[59] = __byte_perm (w[18], w[19], selector); + w[58] = __byte_perm (w[17], w[18], selector); + w[57] = __byte_perm (w[16], w[17], selector); + w[56] = __byte_perm (w[15], w[16], selector); + w[55] = __byte_perm (w[14], w[15], selector); + w[54] = __byte_perm (w[13], w[14], selector); + w[53] = __byte_perm (w[12], w[13], selector); + w[52] = __byte_perm (w[11], w[12], selector); + w[51] = __byte_perm (w[10], w[11], selector); + w[50] = __byte_perm (w[ 9], w[10], selector); + w[49] = __byte_perm (w[ 8], w[ 9], selector); + w[48] = __byte_perm (w[ 7], w[ 8], selector); + w[47] = __byte_perm (w[ 6], w[ 7], selector); + w[46] = __byte_perm (w[ 5], w[ 6], selector); + w[45] = __byte_perm (w[ 4], w[ 5], selector); + w[44] = __byte_perm (w[ 3], w[ 4], selector); + w[43] = __byte_perm (w[ 2], w[ 3], selector); + w[42] = __byte_perm (w[ 1], w[ 2], selector); + w[41] = __byte_perm (w[ 0], w[ 1], selector); + w[40] = __byte_perm ( 0, w[ 0], selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm (w[21], w[22], selector); + w[62] = __byte_perm (w[20], w[21], selector); + w[61] = __byte_perm (w[19], w[20], selector); + w[60] = __byte_perm (w[18], w[19], selector); + w[59] = __byte_perm (w[17], w[18], selector); + w[58] = __byte_perm (w[16], w[17], selector); + w[57] = __byte_perm (w[15], w[16], selector); + w[56] = __byte_perm (w[14], w[15], selector); + w[55] = __byte_perm (w[13], w[14], selector); + w[54] = __byte_perm (w[12], w[13], selector); + w[53] = __byte_perm (w[11], w[12], selector); + w[52] = __byte_perm (w[10], w[11], selector); + w[51] = __byte_perm (w[ 9], w[10], selector); + w[50] = __byte_perm (w[ 8], w[ 9], selector); + w[49] = __byte_perm (w[ 7], w[ 8], selector); + w[48] = __byte_perm (w[ 6], w[ 7], selector); + w[47] = __byte_perm (w[ 5], w[ 6], selector); + w[46] = __byte_perm (w[ 4], w[ 5], selector); + w[45] = __byte_perm (w[ 3], w[ 4], selector); + w[44] = __byte_perm (w[ 2], w[ 3], selector); + w[43] = __byte_perm (w[ 1], w[ 2], selector); + w[42] = __byte_perm (w[ 0], w[ 1], selector); + w[41] = __byte_perm ( 0, w[ 0], selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm (w[20], w[21], selector); + w[62] = __byte_perm (w[19], w[20], selector); + w[61] = __byte_perm (w[18], w[19], selector); + w[60] = __byte_perm (w[17], w[18], selector); + w[59] = __byte_perm (w[16], w[17], selector); + w[58] = __byte_perm (w[15], w[16], selector); + w[57] = __byte_perm (w[14], w[15], selector); + w[56] = __byte_perm (w[13], w[14], selector); + w[55] = __byte_perm (w[12], w[13], selector); + w[54] = __byte_perm (w[11], w[12], selector); + w[53] = __byte_perm (w[10], w[11], selector); + w[52] = __byte_perm (w[ 9], w[10], selector); + w[51] = __byte_perm (w[ 8], w[ 9], selector); + w[50] = __byte_perm (w[ 7], w[ 8], selector); + w[49] = __byte_perm (w[ 6], w[ 7], selector); + w[48] = __byte_perm (w[ 5], w[ 6], selector); + w[47] = __byte_perm (w[ 4], w[ 5], selector); + w[46] = __byte_perm (w[ 3], w[ 4], selector); + w[45] = __byte_perm (w[ 2], w[ 3], selector); + w[44] = __byte_perm (w[ 1], w[ 2], selector); + w[43] = __byte_perm (w[ 0], w[ 1], selector); + w[42] = __byte_perm ( 0, w[ 0], selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = __byte_perm (w[19], w[20], selector); + w[62] = __byte_perm (w[18], w[19], selector); + w[61] = __byte_perm (w[17], w[18], selector); + w[60] = __byte_perm (w[16], w[17], selector); + w[59] = __byte_perm (w[15], w[16], selector); + w[58] = __byte_perm (w[14], w[15], selector); + w[57] = __byte_perm (w[13], w[14], selector); + w[56] = __byte_perm (w[12], w[13], selector); + w[55] = __byte_perm (w[11], w[12], selector); + w[54] = __byte_perm (w[10], w[11], selector); + w[53] = __byte_perm (w[ 9], w[10], selector); + w[52] = __byte_perm (w[ 8], w[ 9], selector); + w[51] = __byte_perm (w[ 7], w[ 8], selector); + w[50] = __byte_perm (w[ 6], w[ 7], selector); + w[49] = __byte_perm (w[ 5], w[ 6], selector); + w[48] = __byte_perm (w[ 4], w[ 5], selector); + w[47] = __byte_perm (w[ 3], w[ 4], selector); + w[46] = __byte_perm (w[ 2], w[ 3], selector); + w[45] = __byte_perm (w[ 1], w[ 2], selector); + w[44] = __byte_perm (w[ 0], w[ 1], selector); + w[43] = __byte_perm ( 0, w[ 0], selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm (w[18], w[19], selector); + w[62] = __byte_perm (w[17], w[18], selector); + w[61] = __byte_perm (w[16], w[17], selector); + w[60] = __byte_perm (w[15], w[16], selector); + w[59] = __byte_perm (w[14], w[15], selector); + w[58] = __byte_perm (w[13], w[14], selector); + w[57] = __byte_perm (w[12], w[13], selector); + w[56] = __byte_perm (w[11], w[12], selector); + w[55] = __byte_perm (w[10], w[11], selector); + w[54] = __byte_perm (w[ 9], w[10], selector); + w[53] = __byte_perm (w[ 8], w[ 9], selector); + w[52] = __byte_perm (w[ 7], w[ 8], selector); + w[51] = __byte_perm (w[ 6], w[ 7], selector); + w[50] = __byte_perm (w[ 5], w[ 6], selector); + w[49] = __byte_perm (w[ 4], w[ 5], selector); + w[48] = __byte_perm (w[ 3], w[ 4], selector); + w[47] = __byte_perm (w[ 2], w[ 3], selector); + w[46] = __byte_perm (w[ 1], w[ 2], selector); + w[45] = __byte_perm (w[ 0], w[ 1], selector); + w[44] = __byte_perm ( 0, w[ 0], selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm (w[17], w[18], selector); + w[62] = __byte_perm (w[16], w[17], selector); + w[61] = __byte_perm (w[15], w[16], selector); + w[60] = __byte_perm (w[14], w[15], selector); + w[59] = __byte_perm (w[13], w[14], selector); + w[58] = __byte_perm (w[12], w[13], selector); + w[57] = __byte_perm (w[11], w[12], selector); + w[56] = __byte_perm (w[10], w[11], selector); + w[55] = __byte_perm (w[ 9], w[10], selector); + w[54] = __byte_perm (w[ 8], w[ 9], selector); + w[53] = __byte_perm (w[ 7], w[ 8], selector); + w[52] = __byte_perm (w[ 6], w[ 7], selector); + w[51] = __byte_perm (w[ 5], w[ 6], selector); + w[50] = __byte_perm (w[ 4], w[ 5], selector); + w[49] = __byte_perm (w[ 3], w[ 4], selector); + w[48] = __byte_perm (w[ 2], w[ 3], selector); + w[47] = __byte_perm (w[ 1], w[ 2], selector); + w[46] = __byte_perm (w[ 0], w[ 1], selector); + w[45] = __byte_perm ( 0, w[ 0], selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm (w[16], w[17], selector); + w[62] = __byte_perm (w[15], w[16], selector); + w[61] = __byte_perm (w[14], w[15], selector); + w[60] = __byte_perm (w[13], w[14], selector); + w[59] = __byte_perm (w[12], w[13], selector); + w[58] = __byte_perm (w[11], w[12], selector); + w[57] = __byte_perm (w[10], w[11], selector); + w[56] = __byte_perm (w[ 9], w[10], selector); + w[55] = __byte_perm (w[ 8], w[ 9], selector); + w[54] = __byte_perm (w[ 7], w[ 8], selector); + w[53] = __byte_perm (w[ 6], w[ 7], selector); + w[52] = __byte_perm (w[ 5], w[ 6], selector); + w[51] = __byte_perm (w[ 4], w[ 5], selector); + w[50] = __byte_perm (w[ 3], w[ 4], selector); + w[49] = __byte_perm (w[ 2], w[ 3], selector); + w[48] = __byte_perm (w[ 1], w[ 2], selector); + w[47] = __byte_perm (w[ 0], w[ 1], selector); + w[46] = __byte_perm ( 0, w[ 0], selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm (w[15], w[16], selector); + w[62] = __byte_perm (w[14], w[15], selector); + w[61] = __byte_perm (w[13], w[14], selector); + w[60] = __byte_perm (w[12], w[13], selector); + w[59] = __byte_perm (w[11], w[12], selector); + w[58] = __byte_perm (w[10], w[11], selector); + w[57] = __byte_perm (w[ 9], w[10], selector); + w[56] = __byte_perm (w[ 8], w[ 9], selector); + w[55] = __byte_perm (w[ 7], w[ 8], selector); + w[54] = __byte_perm (w[ 6], w[ 7], selector); + w[53] = __byte_perm (w[ 5], w[ 6], selector); + w[52] = __byte_perm (w[ 4], w[ 5], selector); + w[51] = __byte_perm (w[ 3], w[ 4], selector); + w[50] = __byte_perm (w[ 2], w[ 3], selector); + w[49] = __byte_perm (w[ 1], w[ 2], selector); + w[48] = __byte_perm (w[ 0], w[ 1], selector); + w[47] = __byte_perm ( 0, w[ 0], selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = __byte_perm (w[14], w[15], selector); + w[62] = __byte_perm (w[13], w[14], selector); + w[61] = __byte_perm (w[12], w[13], selector); + w[60] = __byte_perm (w[11], w[12], selector); + w[59] = __byte_perm (w[10], w[11], selector); + w[58] = __byte_perm (w[ 9], w[10], selector); + w[57] = __byte_perm (w[ 8], w[ 9], selector); + w[56] = __byte_perm (w[ 7], w[ 8], selector); + w[55] = __byte_perm (w[ 6], w[ 7], selector); + w[54] = __byte_perm (w[ 5], w[ 6], selector); + w[53] = __byte_perm (w[ 4], w[ 5], selector); + w[52] = __byte_perm (w[ 3], w[ 4], selector); + w[51] = __byte_perm (w[ 2], w[ 3], selector); + w[50] = __byte_perm (w[ 1], w[ 2], selector); + w[49] = __byte_perm (w[ 0], w[ 1], selector); + w[48] = __byte_perm ( 0, w[ 0], selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm (w[13], w[14], selector); + w[62] = __byte_perm (w[12], w[13], selector); + w[61] = __byte_perm (w[11], w[12], selector); + w[60] = __byte_perm (w[10], w[11], selector); + w[59] = __byte_perm (w[ 9], w[10], selector); + w[58] = __byte_perm (w[ 8], w[ 9], selector); + w[57] = __byte_perm (w[ 7], w[ 8], selector); + w[56] = __byte_perm (w[ 6], w[ 7], selector); + w[55] = __byte_perm (w[ 5], w[ 6], selector); + w[54] = __byte_perm (w[ 4], w[ 5], selector); + w[53] = __byte_perm (w[ 3], w[ 4], selector); + w[52] = __byte_perm (w[ 2], w[ 3], selector); + w[51] = __byte_perm (w[ 1], w[ 2], selector); + w[50] = __byte_perm (w[ 0], w[ 1], selector); + w[49] = __byte_perm ( 0, w[ 0], selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm (w[12], w[13], selector); + w[62] = __byte_perm (w[11], w[12], selector); + w[61] = __byte_perm (w[10], w[11], selector); + w[60] = __byte_perm (w[ 9], w[10], selector); + w[59] = __byte_perm (w[ 8], w[ 9], selector); + w[58] = __byte_perm (w[ 7], w[ 8], selector); + w[57] = __byte_perm (w[ 6], w[ 7], selector); + w[56] = __byte_perm (w[ 5], w[ 6], selector); + w[55] = __byte_perm (w[ 4], w[ 5], selector); + w[54] = __byte_perm (w[ 3], w[ 4], selector); + w[53] = __byte_perm (w[ 2], w[ 3], selector); + w[52] = __byte_perm (w[ 1], w[ 2], selector); + w[51] = __byte_perm (w[ 0], w[ 1], selector); + w[50] = __byte_perm ( 0, w[ 0], selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm (w[11], w[12], selector); + w[62] = __byte_perm (w[10], w[11], selector); + w[61] = __byte_perm (w[ 9], w[10], selector); + w[60] = __byte_perm (w[ 8], w[ 9], selector); + w[59] = __byte_perm (w[ 7], w[ 8], selector); + w[58] = __byte_perm (w[ 6], w[ 7], selector); + w[57] = __byte_perm (w[ 5], w[ 6], selector); + w[56] = __byte_perm (w[ 4], w[ 5], selector); + w[55] = __byte_perm (w[ 3], w[ 4], selector); + w[54] = __byte_perm (w[ 2], w[ 3], selector); + w[53] = __byte_perm (w[ 1], w[ 2], selector); + w[52] = __byte_perm (w[ 0], w[ 1], selector); + w[51] = __byte_perm ( 0, w[ 0], selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm (w[10], w[11], selector); + w[62] = __byte_perm (w[ 9], w[10], selector); + w[61] = __byte_perm (w[ 8], w[ 9], selector); + w[60] = __byte_perm (w[ 7], w[ 8], selector); + w[59] = __byte_perm (w[ 6], w[ 7], selector); + w[58] = __byte_perm (w[ 5], w[ 6], selector); + w[57] = __byte_perm (w[ 4], w[ 5], selector); + w[56] = __byte_perm (w[ 3], w[ 4], selector); + w[55] = __byte_perm (w[ 2], w[ 3], selector); + w[54] = __byte_perm (w[ 1], w[ 2], selector); + w[53] = __byte_perm (w[ 0], w[ 1], selector); + w[52] = __byte_perm ( 0, w[ 0], selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm (w[ 9], w[10], selector); + w[62] = __byte_perm (w[ 8], w[ 9], selector); + w[61] = __byte_perm (w[ 7], w[ 8], selector); + w[60] = __byte_perm (w[ 6], w[ 7], selector); + w[59] = __byte_perm (w[ 5], w[ 6], selector); + w[58] = __byte_perm (w[ 4], w[ 5], selector); + w[57] = __byte_perm (w[ 3], w[ 4], selector); + w[56] = __byte_perm (w[ 2], w[ 3], selector); + w[55] = __byte_perm (w[ 1], w[ 2], selector); + w[54] = __byte_perm (w[ 0], w[ 1], selector); + w[53] = __byte_perm ( 0, w[ 0], selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm (w[ 8], w[ 9], selector); + w[62] = __byte_perm (w[ 7], w[ 8], selector); + w[61] = __byte_perm (w[ 6], w[ 7], selector); + w[60] = __byte_perm (w[ 5], w[ 6], selector); + w[59] = __byte_perm (w[ 4], w[ 5], selector); + w[58] = __byte_perm (w[ 3], w[ 4], selector); + w[57] = __byte_perm (w[ 2], w[ 3], selector); + w[56] = __byte_perm (w[ 1], w[ 2], selector); + w[55] = __byte_perm (w[ 0], w[ 1], selector); + w[54] = __byte_perm ( 0, w[ 0], selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm (w[ 7], w[ 8], selector); + w[62] = __byte_perm (w[ 6], w[ 7], selector); + w[61] = __byte_perm (w[ 5], w[ 6], selector); + w[60] = __byte_perm (w[ 4], w[ 5], selector); + w[59] = __byte_perm (w[ 3], w[ 4], selector); + w[58] = __byte_perm (w[ 2], w[ 3], selector); + w[57] = __byte_perm (w[ 1], w[ 2], selector); + w[56] = __byte_perm (w[ 0], w[ 1], selector); + w[55] = __byte_perm ( 0, w[ 0], selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm (w[ 6], w[ 7], selector); + w[62] = __byte_perm (w[ 5], w[ 6], selector); + w[61] = __byte_perm (w[ 4], w[ 5], selector); + w[60] = __byte_perm (w[ 3], w[ 4], selector); + w[59] = __byte_perm (w[ 2], w[ 3], selector); + w[58] = __byte_perm (w[ 1], w[ 2], selector); + w[57] = __byte_perm (w[ 0], w[ 1], selector); + w[56] = __byte_perm ( 0, w[ 0], selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm (w[ 5], w[ 6], selector); + w[62] = __byte_perm (w[ 4], w[ 5], selector); + w[61] = __byte_perm (w[ 3], w[ 4], selector); + w[60] = __byte_perm (w[ 2], w[ 3], selector); + w[59] = __byte_perm (w[ 1], w[ 2], selector); + w[58] = __byte_perm (w[ 0], w[ 1], selector); + w[57] = __byte_perm ( 0, w[ 0], selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm (w[ 4], w[ 5], selector); + w[62] = __byte_perm (w[ 3], w[ 4], selector); + w[61] = __byte_perm (w[ 2], w[ 3], selector); + w[60] = __byte_perm (w[ 1], w[ 2], selector); + w[59] = __byte_perm (w[ 0], w[ 1], selector); + w[58] = __byte_perm ( 0, w[ 0], selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm (w[ 3], w[ 4], selector); + w[62] = __byte_perm (w[ 2], w[ 3], selector); + w[61] = __byte_perm (w[ 1], w[ 2], selector); + w[60] = __byte_perm (w[ 0], w[ 1], selector); + w[59] = __byte_perm ( 0, w[ 0], selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm (w[ 2], w[ 3], selector); + w[62] = __byte_perm (w[ 1], w[ 2], selector); + w[61] = __byte_perm (w[ 0], w[ 1], selector); + w[60] = __byte_perm ( 0, w[ 0], selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm (w[ 1], w[ 2], selector); + w[62] = __byte_perm (w[ 0], w[ 1], selector); + w[61] = __byte_perm ( 0, w[ 0], selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = __byte_perm (w[ 0], w[ 1], selector); + w[62] = __byte_perm ( 0, w[ 0], selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm ( 0, w[ 0], selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif +} + +void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign (w[62], w[63], offset); + w[62] = amd_bytealign (w[61], w[62], offset); + w[61] = amd_bytealign (w[60], w[61], offset); + w[60] = amd_bytealign (w[59], w[60], offset); + w[59] = amd_bytealign (w[58], w[59], offset); + w[58] = amd_bytealign (w[57], w[58], offset); + w[57] = amd_bytealign (w[56], w[57], offset); + w[56] = amd_bytealign (w[55], w[56], offset); + w[55] = amd_bytealign (w[54], w[55], offset); + w[54] = amd_bytealign (w[53], w[54], offset); + w[53] = amd_bytealign (w[52], w[53], offset); + w[52] = amd_bytealign (w[51], w[52], offset); + w[51] = amd_bytealign (w[50], w[51], offset); + w[50] = amd_bytealign (w[49], w[50], offset); + w[49] = amd_bytealign (w[48], w[49], offset); + w[48] = amd_bytealign (w[47], w[48], offset); + w[47] = amd_bytealign (w[46], w[47], offset); + w[46] = amd_bytealign (w[45], w[46], offset); + w[45] = amd_bytealign (w[44], w[45], offset); + w[44] = amd_bytealign (w[43], w[44], offset); + w[43] = amd_bytealign (w[42], w[43], offset); + w[42] = amd_bytealign (w[41], w[42], offset); + w[41] = amd_bytealign (w[40], w[41], offset); + w[40] = amd_bytealign (w[39], w[40], offset); + w[39] = amd_bytealign (w[38], w[39], offset); + w[38] = amd_bytealign (w[37], w[38], offset); + w[37] = amd_bytealign (w[36], w[37], offset); + w[36] = amd_bytealign (w[35], w[36], offset); + w[35] = amd_bytealign (w[34], w[35], offset); + w[34] = amd_bytealign (w[33], w[34], offset); + w[33] = amd_bytealign (w[32], w[33], offset); + w[32] = amd_bytealign (w[31], w[32], offset); + w[31] = amd_bytealign (w[30], w[31], offset); + w[30] = amd_bytealign (w[29], w[30], offset); + w[29] = amd_bytealign (w[28], w[29], offset); + w[28] = amd_bytealign (w[27], w[28], offset); + w[27] = amd_bytealign (w[26], w[27], offset); + w[26] = amd_bytealign (w[25], w[26], offset); + w[25] = amd_bytealign (w[24], w[25], offset); + w[24] = amd_bytealign (w[23], w[24], offset); + w[23] = amd_bytealign (w[22], w[23], offset); + w[22] = amd_bytealign (w[21], w[22], offset); + w[21] = amd_bytealign (w[20], w[21], offset); + w[20] = amd_bytealign (w[19], w[20], offset); + w[19] = amd_bytealign (w[18], w[19], offset); + w[18] = amd_bytealign (w[17], w[18], offset); + w[17] = amd_bytealign (w[16], w[17], offset); + w[16] = amd_bytealign (w[15], w[16], offset); + w[15] = amd_bytealign (w[14], w[15], offset); + w[14] = amd_bytealign (w[13], w[14], offset); + w[13] = amd_bytealign (w[12], w[13], offset); + w[12] = amd_bytealign (w[11], w[12], offset); + w[11] = amd_bytealign (w[10], w[11], offset); + w[10] = amd_bytealign (w[ 9], w[10], offset); + w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign (w[61], w[62], offset); + w[62] = amd_bytealign (w[60], w[61], offset); + w[61] = amd_bytealign (w[59], w[60], offset); + w[60] = amd_bytealign (w[58], w[59], offset); + w[59] = amd_bytealign (w[57], w[58], offset); + w[58] = amd_bytealign (w[56], w[57], offset); + w[57] = amd_bytealign (w[55], w[56], offset); + w[56] = amd_bytealign (w[54], w[55], offset); + w[55] = amd_bytealign (w[53], w[54], offset); + w[54] = amd_bytealign (w[52], w[53], offset); + w[53] = amd_bytealign (w[51], w[52], offset); + w[52] = amd_bytealign (w[50], w[51], offset); + w[51] = amd_bytealign (w[49], w[50], offset); + w[50] = amd_bytealign (w[48], w[49], offset); + w[49] = amd_bytealign (w[47], w[48], offset); + w[48] = amd_bytealign (w[46], w[47], offset); + w[47] = amd_bytealign (w[45], w[46], offset); + w[46] = amd_bytealign (w[44], w[45], offset); + w[45] = amd_bytealign (w[43], w[44], offset); + w[44] = amd_bytealign (w[42], w[43], offset); + w[43] = amd_bytealign (w[41], w[42], offset); + w[42] = amd_bytealign (w[40], w[41], offset); + w[41] = amd_bytealign (w[39], w[40], offset); + w[40] = amd_bytealign (w[38], w[39], offset); + w[39] = amd_bytealign (w[37], w[38], offset); + w[38] = amd_bytealign (w[36], w[37], offset); + w[37] = amd_bytealign (w[35], w[36], offset); + w[36] = amd_bytealign (w[34], w[35], offset); + w[35] = amd_bytealign (w[33], w[34], offset); + w[34] = amd_bytealign (w[32], w[33], offset); + w[33] = amd_bytealign (w[31], w[32], offset); + w[32] = amd_bytealign (w[30], w[31], offset); + w[31] = amd_bytealign (w[29], w[30], offset); + w[30] = amd_bytealign (w[28], w[29], offset); + w[29] = amd_bytealign (w[27], w[28], offset); + w[28] = amd_bytealign (w[26], w[27], offset); + w[27] = amd_bytealign (w[25], w[26], offset); + w[26] = amd_bytealign (w[24], w[25], offset); + w[25] = amd_bytealign (w[23], w[24], offset); + w[24] = amd_bytealign (w[22], w[23], offset); + w[23] = amd_bytealign (w[21], w[22], offset); + w[22] = amd_bytealign (w[20], w[21], offset); + w[21] = amd_bytealign (w[19], w[20], offset); + w[20] = amd_bytealign (w[18], w[19], offset); + w[19] = amd_bytealign (w[17], w[18], offset); + w[18] = amd_bytealign (w[16], w[17], offset); + w[17] = amd_bytealign (w[15], w[16], offset); + w[16] = amd_bytealign (w[14], w[15], offset); + w[15] = amd_bytealign (w[13], w[14], offset); + w[14] = amd_bytealign (w[12], w[13], offset); + w[13] = amd_bytealign (w[11], w[12], offset); + w[12] = amd_bytealign (w[10], w[11], offset); + w[11] = amd_bytealign (w[ 9], w[10], offset); + w[10] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign (w[60], w[61], offset); + w[62] = amd_bytealign (w[59], w[60], offset); + w[61] = amd_bytealign (w[58], w[59], offset); + w[60] = amd_bytealign (w[57], w[58], offset); + w[59] = amd_bytealign (w[56], w[57], offset); + w[58] = amd_bytealign (w[55], w[56], offset); + w[57] = amd_bytealign (w[54], w[55], offset); + w[56] = amd_bytealign (w[53], w[54], offset); + w[55] = amd_bytealign (w[52], w[53], offset); + w[54] = amd_bytealign (w[51], w[52], offset); + w[53] = amd_bytealign (w[50], w[51], offset); + w[52] = amd_bytealign (w[49], w[50], offset); + w[51] = amd_bytealign (w[48], w[49], offset); + w[50] = amd_bytealign (w[47], w[48], offset); + w[49] = amd_bytealign (w[46], w[47], offset); + w[48] = amd_bytealign (w[45], w[46], offset); + w[47] = amd_bytealign (w[44], w[45], offset); + w[46] = amd_bytealign (w[43], w[44], offset); + w[45] = amd_bytealign (w[42], w[43], offset); + w[44] = amd_bytealign (w[41], w[42], offset); + w[43] = amd_bytealign (w[40], w[41], offset); + w[42] = amd_bytealign (w[39], w[40], offset); + w[41] = amd_bytealign (w[38], w[39], offset); + w[40] = amd_bytealign (w[37], w[38], offset); + w[39] = amd_bytealign (w[36], w[37], offset); + w[38] = amd_bytealign (w[35], w[36], offset); + w[37] = amd_bytealign (w[34], w[35], offset); + w[36] = amd_bytealign (w[33], w[34], offset); + w[35] = amd_bytealign (w[32], w[33], offset); + w[34] = amd_bytealign (w[31], w[32], offset); + w[33] = amd_bytealign (w[30], w[31], offset); + w[32] = amd_bytealign (w[29], w[30], offset); + w[31] = amd_bytealign (w[28], w[29], offset); + w[30] = amd_bytealign (w[27], w[28], offset); + w[29] = amd_bytealign (w[26], w[27], offset); + w[28] = amd_bytealign (w[25], w[26], offset); + w[27] = amd_bytealign (w[24], w[25], offset); + w[26] = amd_bytealign (w[23], w[24], offset); + w[25] = amd_bytealign (w[22], w[23], offset); + w[24] = amd_bytealign (w[21], w[22], offset); + w[23] = amd_bytealign (w[20], w[21], offset); + w[22] = amd_bytealign (w[19], w[20], offset); + w[21] = amd_bytealign (w[18], w[19], offset); + w[20] = amd_bytealign (w[17], w[18], offset); + w[19] = amd_bytealign (w[16], w[17], offset); + w[18] = amd_bytealign (w[15], w[16], offset); + w[17] = amd_bytealign (w[14], w[15], offset); + w[16] = amd_bytealign (w[13], w[14], offset); + w[15] = amd_bytealign (w[12], w[13], offset); + w[14] = amd_bytealign (w[11], w[12], offset); + w[13] = amd_bytealign (w[10], w[11], offset); + w[12] = amd_bytealign (w[ 9], w[10], offset); + w[11] = amd_bytealign (w[ 8], w[ 9], offset); + w[10] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign (w[59], w[60], offset); + w[62] = amd_bytealign (w[58], w[59], offset); + w[61] = amd_bytealign (w[57], w[58], offset); + w[60] = amd_bytealign (w[56], w[57], offset); + w[59] = amd_bytealign (w[55], w[56], offset); + w[58] = amd_bytealign (w[54], w[55], offset); + w[57] = amd_bytealign (w[53], w[54], offset); + w[56] = amd_bytealign (w[52], w[53], offset); + w[55] = amd_bytealign (w[51], w[52], offset); + w[54] = amd_bytealign (w[50], w[51], offset); + w[53] = amd_bytealign (w[49], w[50], offset); + w[52] = amd_bytealign (w[48], w[49], offset); + w[51] = amd_bytealign (w[47], w[48], offset); + w[50] = amd_bytealign (w[46], w[47], offset); + w[49] = amd_bytealign (w[45], w[46], offset); + w[48] = amd_bytealign (w[44], w[45], offset); + w[47] = amd_bytealign (w[43], w[44], offset); + w[46] = amd_bytealign (w[42], w[43], offset); + w[45] = amd_bytealign (w[41], w[42], offset); + w[44] = amd_bytealign (w[40], w[41], offset); + w[43] = amd_bytealign (w[39], w[40], offset); + w[42] = amd_bytealign (w[38], w[39], offset); + w[41] = amd_bytealign (w[37], w[38], offset); + w[40] = amd_bytealign (w[36], w[37], offset); + w[39] = amd_bytealign (w[35], w[36], offset); + w[38] = amd_bytealign (w[34], w[35], offset); + w[37] = amd_bytealign (w[33], w[34], offset); + w[36] = amd_bytealign (w[32], w[33], offset); + w[35] = amd_bytealign (w[31], w[32], offset); + w[34] = amd_bytealign (w[30], w[31], offset); + w[33] = amd_bytealign (w[29], w[30], offset); + w[32] = amd_bytealign (w[28], w[29], offset); + w[31] = amd_bytealign (w[27], w[28], offset); + w[30] = amd_bytealign (w[26], w[27], offset); + w[29] = amd_bytealign (w[25], w[26], offset); + w[28] = amd_bytealign (w[24], w[25], offset); + w[27] = amd_bytealign (w[23], w[24], offset); + w[26] = amd_bytealign (w[22], w[23], offset); + w[25] = amd_bytealign (w[21], w[22], offset); + w[24] = amd_bytealign (w[20], w[21], offset); + w[23] = amd_bytealign (w[19], w[20], offset); + w[22] = amd_bytealign (w[18], w[19], offset); + w[21] = amd_bytealign (w[17], w[18], offset); + w[20] = amd_bytealign (w[16], w[17], offset); + w[19] = amd_bytealign (w[15], w[16], offset); + w[18] = amd_bytealign (w[14], w[15], offset); + w[17] = amd_bytealign (w[13], w[14], offset); + w[16] = amd_bytealign (w[12], w[13], offset); + w[15] = amd_bytealign (w[11], w[12], offset); + w[14] = amd_bytealign (w[10], w[11], offset); + w[13] = amd_bytealign (w[ 9], w[10], offset); + w[12] = amd_bytealign (w[ 8], w[ 9], offset); + w[11] = amd_bytealign (w[ 7], w[ 8], offset); + w[10] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign (w[58], w[59], offset); + w[62] = amd_bytealign (w[57], w[58], offset); + w[61] = amd_bytealign (w[56], w[57], offset); + w[60] = amd_bytealign (w[55], w[56], offset); + w[59] = amd_bytealign (w[54], w[55], offset); + w[58] = amd_bytealign (w[53], w[54], offset); + w[57] = amd_bytealign (w[52], w[53], offset); + w[56] = amd_bytealign (w[51], w[52], offset); + w[55] = amd_bytealign (w[50], w[51], offset); + w[54] = amd_bytealign (w[49], w[50], offset); + w[53] = amd_bytealign (w[48], w[49], offset); + w[52] = amd_bytealign (w[47], w[48], offset); + w[51] = amd_bytealign (w[46], w[47], offset); + w[50] = amd_bytealign (w[45], w[46], offset); + w[49] = amd_bytealign (w[44], w[45], offset); + w[48] = amd_bytealign (w[43], w[44], offset); + w[47] = amd_bytealign (w[42], w[43], offset); + w[46] = amd_bytealign (w[41], w[42], offset); + w[45] = amd_bytealign (w[40], w[41], offset); + w[44] = amd_bytealign (w[39], w[40], offset); + w[43] = amd_bytealign (w[38], w[39], offset); + w[42] = amd_bytealign (w[37], w[38], offset); + w[41] = amd_bytealign (w[36], w[37], offset); + w[40] = amd_bytealign (w[35], w[36], offset); + w[39] = amd_bytealign (w[34], w[35], offset); + w[38] = amd_bytealign (w[33], w[34], offset); + w[37] = amd_bytealign (w[32], w[33], offset); + w[36] = amd_bytealign (w[31], w[32], offset); + w[35] = amd_bytealign (w[30], w[31], offset); + w[34] = amd_bytealign (w[29], w[30], offset); + w[33] = amd_bytealign (w[28], w[29], offset); + w[32] = amd_bytealign (w[27], w[28], offset); + w[31] = amd_bytealign (w[26], w[27], offset); + w[30] = amd_bytealign (w[25], w[26], offset); + w[29] = amd_bytealign (w[24], w[25], offset); + w[28] = amd_bytealign (w[23], w[24], offset); + w[27] = amd_bytealign (w[22], w[23], offset); + w[26] = amd_bytealign (w[21], w[22], offset); + w[25] = amd_bytealign (w[20], w[21], offset); + w[24] = amd_bytealign (w[19], w[20], offset); + w[23] = amd_bytealign (w[18], w[19], offset); + w[22] = amd_bytealign (w[17], w[18], offset); + w[21] = amd_bytealign (w[16], w[17], offset); + w[20] = amd_bytealign (w[15], w[16], offset); + w[19] = amd_bytealign (w[14], w[15], offset); + w[18] = amd_bytealign (w[13], w[14], offset); + w[17] = amd_bytealign (w[12], w[13], offset); + w[16] = amd_bytealign (w[11], w[12], offset); + w[15] = amd_bytealign (w[10], w[11], offset); + w[14] = amd_bytealign (w[ 9], w[10], offset); + w[13] = amd_bytealign (w[ 8], w[ 9], offset); + w[12] = amd_bytealign (w[ 7], w[ 8], offset); + w[11] = amd_bytealign (w[ 6], w[ 7], offset); + w[10] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign (w[57], w[58], offset); + w[62] = amd_bytealign (w[56], w[57], offset); + w[61] = amd_bytealign (w[55], w[56], offset); + w[60] = amd_bytealign (w[54], w[55], offset); + w[59] = amd_bytealign (w[53], w[54], offset); + w[58] = amd_bytealign (w[52], w[53], offset); + w[57] = amd_bytealign (w[51], w[52], offset); + w[56] = amd_bytealign (w[50], w[51], offset); + w[55] = amd_bytealign (w[49], w[50], offset); + w[54] = amd_bytealign (w[48], w[49], offset); + w[53] = amd_bytealign (w[47], w[48], offset); + w[52] = amd_bytealign (w[46], w[47], offset); + w[51] = amd_bytealign (w[45], w[46], offset); + w[50] = amd_bytealign (w[44], w[45], offset); + w[49] = amd_bytealign (w[43], w[44], offset); + w[48] = amd_bytealign (w[42], w[43], offset); + w[47] = amd_bytealign (w[41], w[42], offset); + w[46] = amd_bytealign (w[40], w[41], offset); + w[45] = amd_bytealign (w[39], w[40], offset); + w[44] = amd_bytealign (w[38], w[39], offset); + w[43] = amd_bytealign (w[37], w[38], offset); + w[42] = amd_bytealign (w[36], w[37], offset); + w[41] = amd_bytealign (w[35], w[36], offset); + w[40] = amd_bytealign (w[34], w[35], offset); + w[39] = amd_bytealign (w[33], w[34], offset); + w[38] = amd_bytealign (w[32], w[33], offset); + w[37] = amd_bytealign (w[31], w[32], offset); + w[36] = amd_bytealign (w[30], w[31], offset); + w[35] = amd_bytealign (w[29], w[30], offset); + w[34] = amd_bytealign (w[28], w[29], offset); + w[33] = amd_bytealign (w[27], w[28], offset); + w[32] = amd_bytealign (w[26], w[27], offset); + w[31] = amd_bytealign (w[25], w[26], offset); + w[30] = amd_bytealign (w[24], w[25], offset); + w[29] = amd_bytealign (w[23], w[24], offset); + w[28] = amd_bytealign (w[22], w[23], offset); + w[27] = amd_bytealign (w[21], w[22], offset); + w[26] = amd_bytealign (w[20], w[21], offset); + w[25] = amd_bytealign (w[19], w[20], offset); + w[24] = amd_bytealign (w[18], w[19], offset); + w[23] = amd_bytealign (w[17], w[18], offset); + w[22] = amd_bytealign (w[16], w[17], offset); + w[21] = amd_bytealign (w[15], w[16], offset); + w[20] = amd_bytealign (w[14], w[15], offset); + w[19] = amd_bytealign (w[13], w[14], offset); + w[18] = amd_bytealign (w[12], w[13], offset); + w[17] = amd_bytealign (w[11], w[12], offset); + w[16] = amd_bytealign (w[10], w[11], offset); + w[15] = amd_bytealign (w[ 9], w[10], offset); + w[14] = amd_bytealign (w[ 8], w[ 9], offset); + w[13] = amd_bytealign (w[ 7], w[ 8], offset); + w[12] = amd_bytealign (w[ 6], w[ 7], offset); + w[11] = amd_bytealign (w[ 5], w[ 6], offset); + w[10] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign (w[56], w[57], offset); + w[62] = amd_bytealign (w[55], w[56], offset); + w[61] = amd_bytealign (w[54], w[55], offset); + w[60] = amd_bytealign (w[53], w[54], offset); + w[59] = amd_bytealign (w[52], w[53], offset); + w[58] = amd_bytealign (w[51], w[52], offset); + w[57] = amd_bytealign (w[50], w[51], offset); + w[56] = amd_bytealign (w[49], w[50], offset); + w[55] = amd_bytealign (w[48], w[49], offset); + w[54] = amd_bytealign (w[47], w[48], offset); + w[53] = amd_bytealign (w[46], w[47], offset); + w[52] = amd_bytealign (w[45], w[46], offset); + w[51] = amd_bytealign (w[44], w[45], offset); + w[50] = amd_bytealign (w[43], w[44], offset); + w[49] = amd_bytealign (w[42], w[43], offset); + w[48] = amd_bytealign (w[41], w[42], offset); + w[47] = amd_bytealign (w[40], w[41], offset); + w[46] = amd_bytealign (w[39], w[40], offset); + w[45] = amd_bytealign (w[38], w[39], offset); + w[44] = amd_bytealign (w[37], w[38], offset); + w[43] = amd_bytealign (w[36], w[37], offset); + w[42] = amd_bytealign (w[35], w[36], offset); + w[41] = amd_bytealign (w[34], w[35], offset); + w[40] = amd_bytealign (w[33], w[34], offset); + w[39] = amd_bytealign (w[32], w[33], offset); + w[38] = amd_bytealign (w[31], w[32], offset); + w[37] = amd_bytealign (w[30], w[31], offset); + w[36] = amd_bytealign (w[29], w[30], offset); + w[35] = amd_bytealign (w[28], w[29], offset); + w[34] = amd_bytealign (w[27], w[28], offset); + w[33] = amd_bytealign (w[26], w[27], offset); + w[32] = amd_bytealign (w[25], w[26], offset); + w[31] = amd_bytealign (w[24], w[25], offset); + w[30] = amd_bytealign (w[23], w[24], offset); + w[29] = amd_bytealign (w[22], w[23], offset); + w[28] = amd_bytealign (w[21], w[22], offset); + w[27] = amd_bytealign (w[20], w[21], offset); + w[26] = amd_bytealign (w[19], w[20], offset); + w[25] = amd_bytealign (w[18], w[19], offset); + w[24] = amd_bytealign (w[17], w[18], offset); + w[23] = amd_bytealign (w[16], w[17], offset); + w[22] = amd_bytealign (w[15], w[16], offset); + w[21] = amd_bytealign (w[14], w[15], offset); + w[20] = amd_bytealign (w[13], w[14], offset); + w[19] = amd_bytealign (w[12], w[13], offset); + w[18] = amd_bytealign (w[11], w[12], offset); + w[17] = amd_bytealign (w[10], w[11], offset); + w[16] = amd_bytealign (w[ 9], w[10], offset); + w[15] = amd_bytealign (w[ 8], w[ 9], offset); + w[14] = amd_bytealign (w[ 7], w[ 8], offset); + w[13] = amd_bytealign (w[ 6], w[ 7], offset); + w[12] = amd_bytealign (w[ 5], w[ 6], offset); + w[11] = amd_bytealign (w[ 4], w[ 5], offset); + w[10] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign (w[55], w[56], offset); + w[62] = amd_bytealign (w[54], w[55], offset); + w[61] = amd_bytealign (w[53], w[54], offset); + w[60] = amd_bytealign (w[52], w[53], offset); + w[59] = amd_bytealign (w[51], w[52], offset); + w[58] = amd_bytealign (w[50], w[51], offset); + w[57] = amd_bytealign (w[49], w[50], offset); + w[56] = amd_bytealign (w[48], w[49], offset); + w[55] = amd_bytealign (w[47], w[48], offset); + w[54] = amd_bytealign (w[46], w[47], offset); + w[53] = amd_bytealign (w[45], w[46], offset); + w[52] = amd_bytealign (w[44], w[45], offset); + w[51] = amd_bytealign (w[43], w[44], offset); + w[50] = amd_bytealign (w[42], w[43], offset); + w[49] = amd_bytealign (w[41], w[42], offset); + w[48] = amd_bytealign (w[40], w[41], offset); + w[47] = amd_bytealign (w[39], w[40], offset); + w[46] = amd_bytealign (w[38], w[39], offset); + w[45] = amd_bytealign (w[37], w[38], offset); + w[44] = amd_bytealign (w[36], w[37], offset); + w[43] = amd_bytealign (w[35], w[36], offset); + w[42] = amd_bytealign (w[34], w[35], offset); + w[41] = amd_bytealign (w[33], w[34], offset); + w[40] = amd_bytealign (w[32], w[33], offset); + w[39] = amd_bytealign (w[31], w[32], offset); + w[38] = amd_bytealign (w[30], w[31], offset); + w[37] = amd_bytealign (w[29], w[30], offset); + w[36] = amd_bytealign (w[28], w[29], offset); + w[35] = amd_bytealign (w[27], w[28], offset); + w[34] = amd_bytealign (w[26], w[27], offset); + w[33] = amd_bytealign (w[25], w[26], offset); + w[32] = amd_bytealign (w[24], w[25], offset); + w[31] = amd_bytealign (w[23], w[24], offset); + w[30] = amd_bytealign (w[22], w[23], offset); + w[29] = amd_bytealign (w[21], w[22], offset); + w[28] = amd_bytealign (w[20], w[21], offset); + w[27] = amd_bytealign (w[19], w[20], offset); + w[26] = amd_bytealign (w[18], w[19], offset); + w[25] = amd_bytealign (w[17], w[18], offset); + w[24] = amd_bytealign (w[16], w[17], offset); + w[23] = amd_bytealign (w[15], w[16], offset); + w[22] = amd_bytealign (w[14], w[15], offset); + w[21] = amd_bytealign (w[13], w[14], offset); + w[20] = amd_bytealign (w[12], w[13], offset); + w[19] = amd_bytealign (w[11], w[12], offset); + w[18] = amd_bytealign (w[10], w[11], offset); + w[17] = amd_bytealign (w[ 9], w[10], offset); + w[16] = amd_bytealign (w[ 8], w[ 9], offset); + w[15] = amd_bytealign (w[ 7], w[ 8], offset); + w[14] = amd_bytealign (w[ 6], w[ 7], offset); + w[13] = amd_bytealign (w[ 5], w[ 6], offset); + w[12] = amd_bytealign (w[ 4], w[ 5], offset); + w[11] = amd_bytealign (w[ 3], w[ 4], offset); + w[10] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = amd_bytealign (w[54], w[55], offset); + w[62] = amd_bytealign (w[53], w[54], offset); + w[61] = amd_bytealign (w[52], w[53], offset); + w[60] = amd_bytealign (w[51], w[52], offset); + w[59] = amd_bytealign (w[50], w[51], offset); + w[58] = amd_bytealign (w[49], w[50], offset); + w[57] = amd_bytealign (w[48], w[49], offset); + w[56] = amd_bytealign (w[47], w[48], offset); + w[55] = amd_bytealign (w[46], w[47], offset); + w[54] = amd_bytealign (w[45], w[46], offset); + w[53] = amd_bytealign (w[44], w[45], offset); + w[52] = amd_bytealign (w[43], w[44], offset); + w[51] = amd_bytealign (w[42], w[43], offset); + w[50] = amd_bytealign (w[41], w[42], offset); + w[49] = amd_bytealign (w[40], w[41], offset); + w[48] = amd_bytealign (w[39], w[40], offset); + w[47] = amd_bytealign (w[38], w[39], offset); + w[46] = amd_bytealign (w[37], w[38], offset); + w[45] = amd_bytealign (w[36], w[37], offset); + w[44] = amd_bytealign (w[35], w[36], offset); + w[43] = amd_bytealign (w[34], w[35], offset); + w[42] = amd_bytealign (w[33], w[34], offset); + w[41] = amd_bytealign (w[32], w[33], offset); + w[40] = amd_bytealign (w[31], w[32], offset); + w[39] = amd_bytealign (w[30], w[31], offset); + w[38] = amd_bytealign (w[29], w[30], offset); + w[37] = amd_bytealign (w[28], w[29], offset); + w[36] = amd_bytealign (w[27], w[28], offset); + w[35] = amd_bytealign (w[26], w[27], offset); + w[34] = amd_bytealign (w[25], w[26], offset); + w[33] = amd_bytealign (w[24], w[25], offset); + w[32] = amd_bytealign (w[23], w[24], offset); + w[31] = amd_bytealign (w[22], w[23], offset); + w[30] = amd_bytealign (w[21], w[22], offset); + w[29] = amd_bytealign (w[20], w[21], offset); + w[28] = amd_bytealign (w[19], w[20], offset); + w[27] = amd_bytealign (w[18], w[19], offset); + w[26] = amd_bytealign (w[17], w[18], offset); + w[25] = amd_bytealign (w[16], w[17], offset); + w[24] = amd_bytealign (w[15], w[16], offset); + w[23] = amd_bytealign (w[14], w[15], offset); + w[22] = amd_bytealign (w[13], w[14], offset); + w[21] = amd_bytealign (w[12], w[13], offset); + w[20] = amd_bytealign (w[11], w[12], offset); + w[19] = amd_bytealign (w[10], w[11], offset); + w[18] = amd_bytealign (w[ 9], w[10], offset); + w[17] = amd_bytealign (w[ 8], w[ 9], offset); + w[16] = amd_bytealign (w[ 7], w[ 8], offset); + w[15] = amd_bytealign (w[ 6], w[ 7], offset); + w[14] = amd_bytealign (w[ 5], w[ 6], offset); + w[13] = amd_bytealign (w[ 4], w[ 5], offset); + w[12] = amd_bytealign (w[ 3], w[ 4], offset); + w[11] = amd_bytealign (w[ 2], w[ 3], offset); + w[10] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign (w[53], w[54], offset); + w[62] = amd_bytealign (w[52], w[53], offset); + w[61] = amd_bytealign (w[51], w[52], offset); + w[60] = amd_bytealign (w[50], w[51], offset); + w[59] = amd_bytealign (w[49], w[50], offset); + w[58] = amd_bytealign (w[48], w[49], offset); + w[57] = amd_bytealign (w[47], w[48], offset); + w[56] = amd_bytealign (w[46], w[47], offset); + w[55] = amd_bytealign (w[45], w[46], offset); + w[54] = amd_bytealign (w[44], w[45], offset); + w[53] = amd_bytealign (w[43], w[44], offset); + w[52] = amd_bytealign (w[42], w[43], offset); + w[51] = amd_bytealign (w[41], w[42], offset); + w[50] = amd_bytealign (w[40], w[41], offset); + w[49] = amd_bytealign (w[39], w[40], offset); + w[48] = amd_bytealign (w[38], w[39], offset); + w[47] = amd_bytealign (w[37], w[38], offset); + w[46] = amd_bytealign (w[36], w[37], offset); + w[45] = amd_bytealign (w[35], w[36], offset); + w[44] = amd_bytealign (w[34], w[35], offset); + w[43] = amd_bytealign (w[33], w[34], offset); + w[42] = amd_bytealign (w[32], w[33], offset); + w[41] = amd_bytealign (w[31], w[32], offset); + w[40] = amd_bytealign (w[30], w[31], offset); + w[39] = amd_bytealign (w[29], w[30], offset); + w[38] = amd_bytealign (w[28], w[29], offset); + w[37] = amd_bytealign (w[27], w[28], offset); + w[36] = amd_bytealign (w[26], w[27], offset); + w[35] = amd_bytealign (w[25], w[26], offset); + w[34] = amd_bytealign (w[24], w[25], offset); + w[33] = amd_bytealign (w[23], w[24], offset); + w[32] = amd_bytealign (w[22], w[23], offset); + w[31] = amd_bytealign (w[21], w[22], offset); + w[30] = amd_bytealign (w[20], w[21], offset); + w[29] = amd_bytealign (w[19], w[20], offset); + w[28] = amd_bytealign (w[18], w[19], offset); + w[27] = amd_bytealign (w[17], w[18], offset); + w[26] = amd_bytealign (w[16], w[17], offset); + w[25] = amd_bytealign (w[15], w[16], offset); + w[24] = amd_bytealign (w[14], w[15], offset); + w[23] = amd_bytealign (w[13], w[14], offset); + w[22] = amd_bytealign (w[12], w[13], offset); + w[21] = amd_bytealign (w[11], w[12], offset); + w[20] = amd_bytealign (w[10], w[11], offset); + w[19] = amd_bytealign (w[ 9], w[10], offset); + w[18] = amd_bytealign (w[ 8], w[ 9], offset); + w[17] = amd_bytealign (w[ 7], w[ 8], offset); + w[16] = amd_bytealign (w[ 6], w[ 7], offset); + w[15] = amd_bytealign (w[ 5], w[ 6], offset); + w[14] = amd_bytealign (w[ 4], w[ 5], offset); + w[13] = amd_bytealign (w[ 3], w[ 4], offset); + w[12] = amd_bytealign (w[ 2], w[ 3], offset); + w[11] = amd_bytealign (w[ 1], w[ 2], offset); + w[10] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign (w[52], w[53], offset); + w[62] = amd_bytealign (w[51], w[52], offset); + w[61] = amd_bytealign (w[50], w[51], offset); + w[60] = amd_bytealign (w[49], w[50], offset); + w[59] = amd_bytealign (w[48], w[49], offset); + w[58] = amd_bytealign (w[47], w[48], offset); + w[57] = amd_bytealign (w[46], w[47], offset); + w[56] = amd_bytealign (w[45], w[46], offset); + w[55] = amd_bytealign (w[44], w[45], offset); + w[54] = amd_bytealign (w[43], w[44], offset); + w[53] = amd_bytealign (w[42], w[43], offset); + w[52] = amd_bytealign (w[41], w[42], offset); + w[51] = amd_bytealign (w[40], w[41], offset); + w[50] = amd_bytealign (w[39], w[40], offset); + w[49] = amd_bytealign (w[38], w[39], offset); + w[48] = amd_bytealign (w[37], w[38], offset); + w[47] = amd_bytealign (w[36], w[37], offset); + w[46] = amd_bytealign (w[35], w[36], offset); + w[45] = amd_bytealign (w[34], w[35], offset); + w[44] = amd_bytealign (w[33], w[34], offset); + w[43] = amd_bytealign (w[32], w[33], offset); + w[42] = amd_bytealign (w[31], w[32], offset); + w[41] = amd_bytealign (w[30], w[31], offset); + w[40] = amd_bytealign (w[29], w[30], offset); + w[39] = amd_bytealign (w[28], w[29], offset); + w[38] = amd_bytealign (w[27], w[28], offset); + w[37] = amd_bytealign (w[26], w[27], offset); + w[36] = amd_bytealign (w[25], w[26], offset); + w[35] = amd_bytealign (w[24], w[25], offset); + w[34] = amd_bytealign (w[23], w[24], offset); + w[33] = amd_bytealign (w[22], w[23], offset); + w[32] = amd_bytealign (w[21], w[22], offset); + w[31] = amd_bytealign (w[20], w[21], offset); + w[30] = amd_bytealign (w[19], w[20], offset); + w[29] = amd_bytealign (w[18], w[19], offset); + w[28] = amd_bytealign (w[17], w[18], offset); + w[27] = amd_bytealign (w[16], w[17], offset); + w[26] = amd_bytealign (w[15], w[16], offset); + w[25] = amd_bytealign (w[14], w[15], offset); + w[24] = amd_bytealign (w[13], w[14], offset); + w[23] = amd_bytealign (w[12], w[13], offset); + w[22] = amd_bytealign (w[11], w[12], offset); + w[21] = amd_bytealign (w[10], w[11], offset); + w[20] = amd_bytealign (w[ 9], w[10], offset); + w[19] = amd_bytealign (w[ 8], w[ 9], offset); + w[18] = amd_bytealign (w[ 7], w[ 8], offset); + w[17] = amd_bytealign (w[ 6], w[ 7], offset); + w[16] = amd_bytealign (w[ 5], w[ 6], offset); + w[15] = amd_bytealign (w[ 4], w[ 5], offset); + w[14] = amd_bytealign (w[ 3], w[ 4], offset); + w[13] = amd_bytealign (w[ 2], w[ 3], offset); + w[12] = amd_bytealign (w[ 1], w[ 2], offset); + w[11] = amd_bytealign (w[ 0], w[ 1], offset); + w[10] = amd_bytealign ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign (w[51], w[52], offset); + w[62] = amd_bytealign (w[50], w[51], offset); + w[61] = amd_bytealign (w[49], w[50], offset); + w[60] = amd_bytealign (w[48], w[49], offset); + w[59] = amd_bytealign (w[47], w[48], offset); + w[58] = amd_bytealign (w[46], w[47], offset); + w[57] = amd_bytealign (w[45], w[46], offset); + w[56] = amd_bytealign (w[44], w[45], offset); + w[55] = amd_bytealign (w[43], w[44], offset); + w[54] = amd_bytealign (w[42], w[43], offset); + w[53] = amd_bytealign (w[41], w[42], offset); + w[52] = amd_bytealign (w[40], w[41], offset); + w[51] = amd_bytealign (w[39], w[40], offset); + w[50] = amd_bytealign (w[38], w[39], offset); + w[49] = amd_bytealign (w[37], w[38], offset); + w[48] = amd_bytealign (w[36], w[37], offset); + w[47] = amd_bytealign (w[35], w[36], offset); + w[46] = amd_bytealign (w[34], w[35], offset); + w[45] = amd_bytealign (w[33], w[34], offset); + w[44] = amd_bytealign (w[32], w[33], offset); + w[43] = amd_bytealign (w[31], w[32], offset); + w[42] = amd_bytealign (w[30], w[31], offset); + w[41] = amd_bytealign (w[29], w[30], offset); + w[40] = amd_bytealign (w[28], w[29], offset); + w[39] = amd_bytealign (w[27], w[28], offset); + w[38] = amd_bytealign (w[26], w[27], offset); + w[37] = amd_bytealign (w[25], w[26], offset); + w[36] = amd_bytealign (w[24], w[25], offset); + w[35] = amd_bytealign (w[23], w[24], offset); + w[34] = amd_bytealign (w[22], w[23], offset); + w[33] = amd_bytealign (w[21], w[22], offset); + w[32] = amd_bytealign (w[20], w[21], offset); + w[31] = amd_bytealign (w[19], w[20], offset); + w[30] = amd_bytealign (w[18], w[19], offset); + w[29] = amd_bytealign (w[17], w[18], offset); + w[28] = amd_bytealign (w[16], w[17], offset); + w[27] = amd_bytealign (w[15], w[16], offset); + w[26] = amd_bytealign (w[14], w[15], offset); + w[25] = amd_bytealign (w[13], w[14], offset); + w[24] = amd_bytealign (w[12], w[13], offset); + w[23] = amd_bytealign (w[11], w[12], offset); + w[22] = amd_bytealign (w[10], w[11], offset); + w[21] = amd_bytealign (w[ 9], w[10], offset); + w[20] = amd_bytealign (w[ 8], w[ 9], offset); + w[19] = amd_bytealign (w[ 7], w[ 8], offset); + w[18] = amd_bytealign (w[ 6], w[ 7], offset); + w[17] = amd_bytealign (w[ 5], w[ 6], offset); + w[16] = amd_bytealign (w[ 4], w[ 5], offset); + w[15] = amd_bytealign (w[ 3], w[ 4], offset); + w[14] = amd_bytealign (w[ 2], w[ 3], offset); + w[13] = amd_bytealign (w[ 1], w[ 2], offset); + w[12] = amd_bytealign (w[ 0], w[ 1], offset); + w[11] = amd_bytealign ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign (w[50], w[51], offset); + w[62] = amd_bytealign (w[49], w[50], offset); + w[61] = amd_bytealign (w[48], w[49], offset); + w[60] = amd_bytealign (w[47], w[48], offset); + w[59] = amd_bytealign (w[46], w[47], offset); + w[58] = amd_bytealign (w[45], w[46], offset); + w[57] = amd_bytealign (w[44], w[45], offset); + w[56] = amd_bytealign (w[43], w[44], offset); + w[55] = amd_bytealign (w[42], w[43], offset); + w[54] = amd_bytealign (w[41], w[42], offset); + w[53] = amd_bytealign (w[40], w[41], offset); + w[52] = amd_bytealign (w[39], w[40], offset); + w[51] = amd_bytealign (w[38], w[39], offset); + w[50] = amd_bytealign (w[37], w[38], offset); + w[49] = amd_bytealign (w[36], w[37], offset); + w[48] = amd_bytealign (w[35], w[36], offset); + w[47] = amd_bytealign (w[34], w[35], offset); + w[46] = amd_bytealign (w[33], w[34], offset); + w[45] = amd_bytealign (w[32], w[33], offset); + w[44] = amd_bytealign (w[31], w[32], offset); + w[43] = amd_bytealign (w[30], w[31], offset); + w[42] = amd_bytealign (w[29], w[30], offset); + w[41] = amd_bytealign (w[28], w[29], offset); + w[40] = amd_bytealign (w[27], w[28], offset); + w[39] = amd_bytealign (w[26], w[27], offset); + w[38] = amd_bytealign (w[25], w[26], offset); + w[37] = amd_bytealign (w[24], w[25], offset); + w[36] = amd_bytealign (w[23], w[24], offset); + w[35] = amd_bytealign (w[22], w[23], offset); + w[34] = amd_bytealign (w[21], w[22], offset); + w[33] = amd_bytealign (w[20], w[21], offset); + w[32] = amd_bytealign (w[19], w[20], offset); + w[31] = amd_bytealign (w[18], w[19], offset); + w[30] = amd_bytealign (w[17], w[18], offset); + w[29] = amd_bytealign (w[16], w[17], offset); + w[28] = amd_bytealign (w[15], w[16], offset); + w[27] = amd_bytealign (w[14], w[15], offset); + w[26] = amd_bytealign (w[13], w[14], offset); + w[25] = amd_bytealign (w[12], w[13], offset); + w[24] = amd_bytealign (w[11], w[12], offset); + w[23] = amd_bytealign (w[10], w[11], offset); + w[22] = amd_bytealign (w[ 9], w[10], offset); + w[21] = amd_bytealign (w[ 8], w[ 9], offset); + w[20] = amd_bytealign (w[ 7], w[ 8], offset); + w[19] = amd_bytealign (w[ 6], w[ 7], offset); + w[18] = amd_bytealign (w[ 5], w[ 6], offset); + w[17] = amd_bytealign (w[ 4], w[ 5], offset); + w[16] = amd_bytealign (w[ 3], w[ 4], offset); + w[15] = amd_bytealign (w[ 2], w[ 3], offset); + w[14] = amd_bytealign (w[ 1], w[ 2], offset); + w[13] = amd_bytealign (w[ 0], w[ 1], offset); + w[12] = amd_bytealign ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign (w[49], w[50], offset); + w[62] = amd_bytealign (w[48], w[49], offset); + w[61] = amd_bytealign (w[47], w[48], offset); + w[60] = amd_bytealign (w[46], w[47], offset); + w[59] = amd_bytealign (w[45], w[46], offset); + w[58] = amd_bytealign (w[44], w[45], offset); + w[57] = amd_bytealign (w[43], w[44], offset); + w[56] = amd_bytealign (w[42], w[43], offset); + w[55] = amd_bytealign (w[41], w[42], offset); + w[54] = amd_bytealign (w[40], w[41], offset); + w[53] = amd_bytealign (w[39], w[40], offset); + w[52] = amd_bytealign (w[38], w[39], offset); + w[51] = amd_bytealign (w[37], w[38], offset); + w[50] = amd_bytealign (w[36], w[37], offset); + w[49] = amd_bytealign (w[35], w[36], offset); + w[48] = amd_bytealign (w[34], w[35], offset); + w[47] = amd_bytealign (w[33], w[34], offset); + w[46] = amd_bytealign (w[32], w[33], offset); + w[45] = amd_bytealign (w[31], w[32], offset); + w[44] = amd_bytealign (w[30], w[31], offset); + w[43] = amd_bytealign (w[29], w[30], offset); + w[42] = amd_bytealign (w[28], w[29], offset); + w[41] = amd_bytealign (w[27], w[28], offset); + w[40] = amd_bytealign (w[26], w[27], offset); + w[39] = amd_bytealign (w[25], w[26], offset); + w[38] = amd_bytealign (w[24], w[25], offset); + w[37] = amd_bytealign (w[23], w[24], offset); + w[36] = amd_bytealign (w[22], w[23], offset); + w[35] = amd_bytealign (w[21], w[22], offset); + w[34] = amd_bytealign (w[20], w[21], offset); + w[33] = amd_bytealign (w[19], w[20], offset); + w[32] = amd_bytealign (w[18], w[19], offset); + w[31] = amd_bytealign (w[17], w[18], offset); + w[30] = amd_bytealign (w[16], w[17], offset); + w[29] = amd_bytealign (w[15], w[16], offset); + w[28] = amd_bytealign (w[14], w[15], offset); + w[27] = amd_bytealign (w[13], w[14], offset); + w[26] = amd_bytealign (w[12], w[13], offset); + w[25] = amd_bytealign (w[11], w[12], offset); + w[24] = amd_bytealign (w[10], w[11], offset); + w[23] = amd_bytealign (w[ 9], w[10], offset); + w[22] = amd_bytealign (w[ 8], w[ 9], offset); + w[21] = amd_bytealign (w[ 7], w[ 8], offset); + w[20] = amd_bytealign (w[ 6], w[ 7], offset); + w[19] = amd_bytealign (w[ 5], w[ 6], offset); + w[18] = amd_bytealign (w[ 4], w[ 5], offset); + w[17] = amd_bytealign (w[ 3], w[ 4], offset); + w[16] = amd_bytealign (w[ 2], w[ 3], offset); + w[15] = amd_bytealign (w[ 1], w[ 2], offset); + w[14] = amd_bytealign (w[ 0], w[ 1], offset); + w[13] = amd_bytealign ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign (w[48], w[49], offset); + w[62] = amd_bytealign (w[47], w[48], offset); + w[61] = amd_bytealign (w[46], w[47], offset); + w[60] = amd_bytealign (w[45], w[46], offset); + w[59] = amd_bytealign (w[44], w[45], offset); + w[58] = amd_bytealign (w[43], w[44], offset); + w[57] = amd_bytealign (w[42], w[43], offset); + w[56] = amd_bytealign (w[41], w[42], offset); + w[55] = amd_bytealign (w[40], w[41], offset); + w[54] = amd_bytealign (w[39], w[40], offset); + w[53] = amd_bytealign (w[38], w[39], offset); + w[52] = amd_bytealign (w[37], w[38], offset); + w[51] = amd_bytealign (w[36], w[37], offset); + w[50] = amd_bytealign (w[35], w[36], offset); + w[49] = amd_bytealign (w[34], w[35], offset); + w[48] = amd_bytealign (w[33], w[34], offset); + w[47] = amd_bytealign (w[32], w[33], offset); + w[46] = amd_bytealign (w[31], w[32], offset); + w[45] = amd_bytealign (w[30], w[31], offset); + w[44] = amd_bytealign (w[29], w[30], offset); + w[43] = amd_bytealign (w[28], w[29], offset); + w[42] = amd_bytealign (w[27], w[28], offset); + w[41] = amd_bytealign (w[26], w[27], offset); + w[40] = amd_bytealign (w[25], w[26], offset); + w[39] = amd_bytealign (w[24], w[25], offset); + w[38] = amd_bytealign (w[23], w[24], offset); + w[37] = amd_bytealign (w[22], w[23], offset); + w[36] = amd_bytealign (w[21], w[22], offset); + w[35] = amd_bytealign (w[20], w[21], offset); + w[34] = amd_bytealign (w[19], w[20], offset); + w[33] = amd_bytealign (w[18], w[19], offset); + w[32] = amd_bytealign (w[17], w[18], offset); + w[31] = amd_bytealign (w[16], w[17], offset); + w[30] = amd_bytealign (w[15], w[16], offset); + w[29] = amd_bytealign (w[14], w[15], offset); + w[28] = amd_bytealign (w[13], w[14], offset); + w[27] = amd_bytealign (w[12], w[13], offset); + w[26] = amd_bytealign (w[11], w[12], offset); + w[25] = amd_bytealign (w[10], w[11], offset); + w[24] = amd_bytealign (w[ 9], w[10], offset); + w[23] = amd_bytealign (w[ 8], w[ 9], offset); + w[22] = amd_bytealign (w[ 7], w[ 8], offset); + w[21] = amd_bytealign (w[ 6], w[ 7], offset); + w[20] = amd_bytealign (w[ 5], w[ 6], offset); + w[19] = amd_bytealign (w[ 4], w[ 5], offset); + w[18] = amd_bytealign (w[ 3], w[ 4], offset); + w[17] = amd_bytealign (w[ 2], w[ 3], offset); + w[16] = amd_bytealign (w[ 1], w[ 2], offset); + w[15] = amd_bytealign (w[ 0], w[ 1], offset); + w[14] = amd_bytealign ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign (w[47], w[48], offset); + w[62] = amd_bytealign (w[46], w[47], offset); + w[61] = amd_bytealign (w[45], w[46], offset); + w[60] = amd_bytealign (w[44], w[45], offset); + w[59] = amd_bytealign (w[43], w[44], offset); + w[58] = amd_bytealign (w[42], w[43], offset); + w[57] = amd_bytealign (w[41], w[42], offset); + w[56] = amd_bytealign (w[40], w[41], offset); + w[55] = amd_bytealign (w[39], w[40], offset); + w[54] = amd_bytealign (w[38], w[39], offset); + w[53] = amd_bytealign (w[37], w[38], offset); + w[52] = amd_bytealign (w[36], w[37], offset); + w[51] = amd_bytealign (w[35], w[36], offset); + w[50] = amd_bytealign (w[34], w[35], offset); + w[49] = amd_bytealign (w[33], w[34], offset); + w[48] = amd_bytealign (w[32], w[33], offset); + w[47] = amd_bytealign (w[31], w[32], offset); + w[46] = amd_bytealign (w[30], w[31], offset); + w[45] = amd_bytealign (w[29], w[30], offset); + w[44] = amd_bytealign (w[28], w[29], offset); + w[43] = amd_bytealign (w[27], w[28], offset); + w[42] = amd_bytealign (w[26], w[27], offset); + w[41] = amd_bytealign (w[25], w[26], offset); + w[40] = amd_bytealign (w[24], w[25], offset); + w[39] = amd_bytealign (w[23], w[24], offset); + w[38] = amd_bytealign (w[22], w[23], offset); + w[37] = amd_bytealign (w[21], w[22], offset); + w[36] = amd_bytealign (w[20], w[21], offset); + w[35] = amd_bytealign (w[19], w[20], offset); + w[34] = amd_bytealign (w[18], w[19], offset); + w[33] = amd_bytealign (w[17], w[18], offset); + w[32] = amd_bytealign (w[16], w[17], offset); + w[31] = amd_bytealign (w[15], w[16], offset); + w[30] = amd_bytealign (w[14], w[15], offset); + w[29] = amd_bytealign (w[13], w[14], offset); + w[28] = amd_bytealign (w[12], w[13], offset); + w[27] = amd_bytealign (w[11], w[12], offset); + w[26] = amd_bytealign (w[10], w[11], offset); + w[25] = amd_bytealign (w[ 9], w[10], offset); + w[24] = amd_bytealign (w[ 8], w[ 9], offset); + w[23] = amd_bytealign (w[ 7], w[ 8], offset); + w[22] = amd_bytealign (w[ 6], w[ 7], offset); + w[21] = amd_bytealign (w[ 5], w[ 6], offset); + w[20] = amd_bytealign (w[ 4], w[ 5], offset); + w[19] = amd_bytealign (w[ 3], w[ 4], offset); + w[18] = amd_bytealign (w[ 2], w[ 3], offset); + w[17] = amd_bytealign (w[ 1], w[ 2], offset); + w[16] = amd_bytealign (w[ 0], w[ 1], offset); + w[15] = amd_bytealign ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign (w[46], w[47], offset); + w[62] = amd_bytealign (w[45], w[46], offset); + w[61] = amd_bytealign (w[44], w[45], offset); + w[60] = amd_bytealign (w[43], w[44], offset); + w[59] = amd_bytealign (w[42], w[43], offset); + w[58] = amd_bytealign (w[41], w[42], offset); + w[57] = amd_bytealign (w[40], w[41], offset); + w[56] = amd_bytealign (w[39], w[40], offset); + w[55] = amd_bytealign (w[38], w[39], offset); + w[54] = amd_bytealign (w[37], w[38], offset); + w[53] = amd_bytealign (w[36], w[37], offset); + w[52] = amd_bytealign (w[35], w[36], offset); + w[51] = amd_bytealign (w[34], w[35], offset); + w[50] = amd_bytealign (w[33], w[34], offset); + w[49] = amd_bytealign (w[32], w[33], offset); + w[48] = amd_bytealign (w[31], w[32], offset); + w[47] = amd_bytealign (w[30], w[31], offset); + w[46] = amd_bytealign (w[29], w[30], offset); + w[45] = amd_bytealign (w[28], w[29], offset); + w[44] = amd_bytealign (w[27], w[28], offset); + w[43] = amd_bytealign (w[26], w[27], offset); + w[42] = amd_bytealign (w[25], w[26], offset); + w[41] = amd_bytealign (w[24], w[25], offset); + w[40] = amd_bytealign (w[23], w[24], offset); + w[39] = amd_bytealign (w[22], w[23], offset); + w[38] = amd_bytealign (w[21], w[22], offset); + w[37] = amd_bytealign (w[20], w[21], offset); + w[36] = amd_bytealign (w[19], w[20], offset); + w[35] = amd_bytealign (w[18], w[19], offset); + w[34] = amd_bytealign (w[17], w[18], offset); + w[33] = amd_bytealign (w[16], w[17], offset); + w[32] = amd_bytealign (w[15], w[16], offset); + w[31] = amd_bytealign (w[14], w[15], offset); + w[30] = amd_bytealign (w[13], w[14], offset); + w[29] = amd_bytealign (w[12], w[13], offset); + w[28] = amd_bytealign (w[11], w[12], offset); + w[27] = amd_bytealign (w[10], w[11], offset); + w[26] = amd_bytealign (w[ 9], w[10], offset); + w[25] = amd_bytealign (w[ 8], w[ 9], offset); + w[24] = amd_bytealign (w[ 7], w[ 8], offset); + w[23] = amd_bytealign (w[ 6], w[ 7], offset); + w[22] = amd_bytealign (w[ 5], w[ 6], offset); + w[21] = amd_bytealign (w[ 4], w[ 5], offset); + w[20] = amd_bytealign (w[ 3], w[ 4], offset); + w[19] = amd_bytealign (w[ 2], w[ 3], offset); + w[18] = amd_bytealign (w[ 1], w[ 2], offset); + w[17] = amd_bytealign (w[ 0], w[ 1], offset); + w[16] = amd_bytealign ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign (w[45], w[46], offset); + w[62] = amd_bytealign (w[44], w[45], offset); + w[61] = amd_bytealign (w[43], w[44], offset); + w[60] = amd_bytealign (w[42], w[43], offset); + w[59] = amd_bytealign (w[41], w[42], offset); + w[58] = amd_bytealign (w[40], w[41], offset); + w[57] = amd_bytealign (w[39], w[40], offset); + w[56] = amd_bytealign (w[38], w[39], offset); + w[55] = amd_bytealign (w[37], w[38], offset); + w[54] = amd_bytealign (w[36], w[37], offset); + w[53] = amd_bytealign (w[35], w[36], offset); + w[52] = amd_bytealign (w[34], w[35], offset); + w[51] = amd_bytealign (w[33], w[34], offset); + w[50] = amd_bytealign (w[32], w[33], offset); + w[49] = amd_bytealign (w[31], w[32], offset); + w[48] = amd_bytealign (w[30], w[31], offset); + w[47] = amd_bytealign (w[29], w[30], offset); + w[46] = amd_bytealign (w[28], w[29], offset); + w[45] = amd_bytealign (w[27], w[28], offset); + w[44] = amd_bytealign (w[26], w[27], offset); + w[43] = amd_bytealign (w[25], w[26], offset); + w[42] = amd_bytealign (w[24], w[25], offset); + w[41] = amd_bytealign (w[23], w[24], offset); + w[40] = amd_bytealign (w[22], w[23], offset); + w[39] = amd_bytealign (w[21], w[22], offset); + w[38] = amd_bytealign (w[20], w[21], offset); + w[37] = amd_bytealign (w[19], w[20], offset); + w[36] = amd_bytealign (w[18], w[19], offset); + w[35] = amd_bytealign (w[17], w[18], offset); + w[34] = amd_bytealign (w[16], w[17], offset); + w[33] = amd_bytealign (w[15], w[16], offset); + w[32] = amd_bytealign (w[14], w[15], offset); + w[31] = amd_bytealign (w[13], w[14], offset); + w[30] = amd_bytealign (w[12], w[13], offset); + w[29] = amd_bytealign (w[11], w[12], offset); + w[28] = amd_bytealign (w[10], w[11], offset); + w[27] = amd_bytealign (w[ 9], w[10], offset); + w[26] = amd_bytealign (w[ 8], w[ 9], offset); + w[25] = amd_bytealign (w[ 7], w[ 8], offset); + w[24] = amd_bytealign (w[ 6], w[ 7], offset); + w[23] = amd_bytealign (w[ 5], w[ 6], offset); + w[22] = amd_bytealign (w[ 4], w[ 5], offset); + w[21] = amd_bytealign (w[ 3], w[ 4], offset); + w[20] = amd_bytealign (w[ 2], w[ 3], offset); + w[19] = amd_bytealign (w[ 1], w[ 2], offset); + w[18] = amd_bytealign (w[ 0], w[ 1], offset); + w[17] = amd_bytealign ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign (w[44], w[45], offset); + w[62] = amd_bytealign (w[43], w[44], offset); + w[61] = amd_bytealign (w[42], w[43], offset); + w[60] = amd_bytealign (w[41], w[42], offset); + w[59] = amd_bytealign (w[40], w[41], offset); + w[58] = amd_bytealign (w[39], w[40], offset); + w[57] = amd_bytealign (w[38], w[39], offset); + w[56] = amd_bytealign (w[37], w[38], offset); + w[55] = amd_bytealign (w[36], w[37], offset); + w[54] = amd_bytealign (w[35], w[36], offset); + w[53] = amd_bytealign (w[34], w[35], offset); + w[52] = amd_bytealign (w[33], w[34], offset); + w[51] = amd_bytealign (w[32], w[33], offset); + w[50] = amd_bytealign (w[31], w[32], offset); + w[49] = amd_bytealign (w[30], w[31], offset); + w[48] = amd_bytealign (w[29], w[30], offset); + w[47] = amd_bytealign (w[28], w[29], offset); + w[46] = amd_bytealign (w[27], w[28], offset); + w[45] = amd_bytealign (w[26], w[27], offset); + w[44] = amd_bytealign (w[25], w[26], offset); + w[43] = amd_bytealign (w[24], w[25], offset); + w[42] = amd_bytealign (w[23], w[24], offset); + w[41] = amd_bytealign (w[22], w[23], offset); + w[40] = amd_bytealign (w[21], w[22], offset); + w[39] = amd_bytealign (w[20], w[21], offset); + w[38] = amd_bytealign (w[19], w[20], offset); + w[37] = amd_bytealign (w[18], w[19], offset); + w[36] = amd_bytealign (w[17], w[18], offset); + w[35] = amd_bytealign (w[16], w[17], offset); + w[34] = amd_bytealign (w[15], w[16], offset); + w[33] = amd_bytealign (w[14], w[15], offset); + w[32] = amd_bytealign (w[13], w[14], offset); + w[31] = amd_bytealign (w[12], w[13], offset); + w[30] = amd_bytealign (w[11], w[12], offset); + w[29] = amd_bytealign (w[10], w[11], offset); + w[28] = amd_bytealign (w[ 9], w[10], offset); + w[27] = amd_bytealign (w[ 8], w[ 9], offset); + w[26] = amd_bytealign (w[ 7], w[ 8], offset); + w[25] = amd_bytealign (w[ 6], w[ 7], offset); + w[24] = amd_bytealign (w[ 5], w[ 6], offset); + w[23] = amd_bytealign (w[ 4], w[ 5], offset); + w[22] = amd_bytealign (w[ 3], w[ 4], offset); + w[21] = amd_bytealign (w[ 2], w[ 3], offset); + w[20] = amd_bytealign (w[ 1], w[ 2], offset); + w[19] = amd_bytealign (w[ 0], w[ 1], offset); + w[18] = amd_bytealign ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign (w[43], w[44], offset); + w[62] = amd_bytealign (w[42], w[43], offset); + w[61] = amd_bytealign (w[41], w[42], offset); + w[60] = amd_bytealign (w[40], w[41], offset); + w[59] = amd_bytealign (w[39], w[40], offset); + w[58] = amd_bytealign (w[38], w[39], offset); + w[57] = amd_bytealign (w[37], w[38], offset); + w[56] = amd_bytealign (w[36], w[37], offset); + w[55] = amd_bytealign (w[35], w[36], offset); + w[54] = amd_bytealign (w[34], w[35], offset); + w[53] = amd_bytealign (w[33], w[34], offset); + w[52] = amd_bytealign (w[32], w[33], offset); + w[51] = amd_bytealign (w[31], w[32], offset); + w[50] = amd_bytealign (w[30], w[31], offset); + w[49] = amd_bytealign (w[29], w[30], offset); + w[48] = amd_bytealign (w[28], w[29], offset); + w[47] = amd_bytealign (w[27], w[28], offset); + w[46] = amd_bytealign (w[26], w[27], offset); + w[45] = amd_bytealign (w[25], w[26], offset); + w[44] = amd_bytealign (w[24], w[25], offset); + w[43] = amd_bytealign (w[23], w[24], offset); + w[42] = amd_bytealign (w[22], w[23], offset); + w[41] = amd_bytealign (w[21], w[22], offset); + w[40] = amd_bytealign (w[20], w[21], offset); + w[39] = amd_bytealign (w[19], w[20], offset); + w[38] = amd_bytealign (w[18], w[19], offset); + w[37] = amd_bytealign (w[17], w[18], offset); + w[36] = amd_bytealign (w[16], w[17], offset); + w[35] = amd_bytealign (w[15], w[16], offset); + w[34] = amd_bytealign (w[14], w[15], offset); + w[33] = amd_bytealign (w[13], w[14], offset); + w[32] = amd_bytealign (w[12], w[13], offset); + w[31] = amd_bytealign (w[11], w[12], offset); + w[30] = amd_bytealign (w[10], w[11], offset); + w[29] = amd_bytealign (w[ 9], w[10], offset); + w[28] = amd_bytealign (w[ 8], w[ 9], offset); + w[27] = amd_bytealign (w[ 7], w[ 8], offset); + w[26] = amd_bytealign (w[ 6], w[ 7], offset); + w[25] = amd_bytealign (w[ 5], w[ 6], offset); + w[24] = amd_bytealign (w[ 4], w[ 5], offset); + w[23] = amd_bytealign (w[ 3], w[ 4], offset); + w[22] = amd_bytealign (w[ 2], w[ 3], offset); + w[21] = amd_bytealign (w[ 1], w[ 2], offset); + w[20] = amd_bytealign (w[ 0], w[ 1], offset); + w[19] = amd_bytealign ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = amd_bytealign (w[42], w[43], offset); + w[62] = amd_bytealign (w[41], w[42], offset); + w[61] = amd_bytealign (w[40], w[41], offset); + w[60] = amd_bytealign (w[39], w[40], offset); + w[59] = amd_bytealign (w[38], w[39], offset); + w[58] = amd_bytealign (w[37], w[38], offset); + w[57] = amd_bytealign (w[36], w[37], offset); + w[56] = amd_bytealign (w[35], w[36], offset); + w[55] = amd_bytealign (w[34], w[35], offset); + w[54] = amd_bytealign (w[33], w[34], offset); + w[53] = amd_bytealign (w[32], w[33], offset); + w[52] = amd_bytealign (w[31], w[32], offset); + w[51] = amd_bytealign (w[30], w[31], offset); + w[50] = amd_bytealign (w[29], w[30], offset); + w[49] = amd_bytealign (w[28], w[29], offset); + w[48] = amd_bytealign (w[27], w[28], offset); + w[47] = amd_bytealign (w[26], w[27], offset); + w[46] = amd_bytealign (w[25], w[26], offset); + w[45] = amd_bytealign (w[24], w[25], offset); + w[44] = amd_bytealign (w[23], w[24], offset); + w[43] = amd_bytealign (w[22], w[23], offset); + w[42] = amd_bytealign (w[21], w[22], offset); + w[41] = amd_bytealign (w[20], w[21], offset); + w[40] = amd_bytealign (w[19], w[20], offset); + w[39] = amd_bytealign (w[18], w[19], offset); + w[38] = amd_bytealign (w[17], w[18], offset); + w[37] = amd_bytealign (w[16], w[17], offset); + w[36] = amd_bytealign (w[15], w[16], offset); + w[35] = amd_bytealign (w[14], w[15], offset); + w[34] = amd_bytealign (w[13], w[14], offset); + w[33] = amd_bytealign (w[12], w[13], offset); + w[32] = amd_bytealign (w[11], w[12], offset); + w[31] = amd_bytealign (w[10], w[11], offset); + w[30] = amd_bytealign (w[ 9], w[10], offset); + w[29] = amd_bytealign (w[ 8], w[ 9], offset); + w[28] = amd_bytealign (w[ 7], w[ 8], offset); + w[27] = amd_bytealign (w[ 6], w[ 7], offset); + w[26] = amd_bytealign (w[ 5], w[ 6], offset); + w[25] = amd_bytealign (w[ 4], w[ 5], offset); + w[24] = amd_bytealign (w[ 3], w[ 4], offset); + w[23] = amd_bytealign (w[ 2], w[ 3], offset); + w[22] = amd_bytealign (w[ 1], w[ 2], offset); + w[21] = amd_bytealign (w[ 0], w[ 1], offset); + w[20] = amd_bytealign ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign (w[41], w[42], offset); + w[62] = amd_bytealign (w[40], w[41], offset); + w[61] = amd_bytealign (w[39], w[40], offset); + w[60] = amd_bytealign (w[38], w[39], offset); + w[59] = amd_bytealign (w[37], w[38], offset); + w[58] = amd_bytealign (w[36], w[37], offset); + w[57] = amd_bytealign (w[35], w[36], offset); + w[56] = amd_bytealign (w[34], w[35], offset); + w[55] = amd_bytealign (w[33], w[34], offset); + w[54] = amd_bytealign (w[32], w[33], offset); + w[53] = amd_bytealign (w[31], w[32], offset); + w[52] = amd_bytealign (w[30], w[31], offset); + w[51] = amd_bytealign (w[29], w[30], offset); + w[50] = amd_bytealign (w[28], w[29], offset); + w[49] = amd_bytealign (w[27], w[28], offset); + w[48] = amd_bytealign (w[26], w[27], offset); + w[47] = amd_bytealign (w[25], w[26], offset); + w[46] = amd_bytealign (w[24], w[25], offset); + w[45] = amd_bytealign (w[23], w[24], offset); + w[44] = amd_bytealign (w[22], w[23], offset); + w[43] = amd_bytealign (w[21], w[22], offset); + w[42] = amd_bytealign (w[20], w[21], offset); + w[41] = amd_bytealign (w[19], w[20], offset); + w[40] = amd_bytealign (w[18], w[19], offset); + w[39] = amd_bytealign (w[17], w[18], offset); + w[38] = amd_bytealign (w[16], w[17], offset); + w[37] = amd_bytealign (w[15], w[16], offset); + w[36] = amd_bytealign (w[14], w[15], offset); + w[35] = amd_bytealign (w[13], w[14], offset); + w[34] = amd_bytealign (w[12], w[13], offset); + w[33] = amd_bytealign (w[11], w[12], offset); + w[32] = amd_bytealign (w[10], w[11], offset); + w[31] = amd_bytealign (w[ 9], w[10], offset); + w[30] = amd_bytealign (w[ 8], w[ 9], offset); + w[29] = amd_bytealign (w[ 7], w[ 8], offset); + w[28] = amd_bytealign (w[ 6], w[ 7], offset); + w[27] = amd_bytealign (w[ 5], w[ 6], offset); + w[26] = amd_bytealign (w[ 4], w[ 5], offset); + w[25] = amd_bytealign (w[ 3], w[ 4], offset); + w[24] = amd_bytealign (w[ 2], w[ 3], offset); + w[23] = amd_bytealign (w[ 1], w[ 2], offset); + w[22] = amd_bytealign (w[ 0], w[ 1], offset); + w[21] = amd_bytealign ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = amd_bytealign (w[40], w[41], offset); + w[62] = amd_bytealign (w[39], w[40], offset); + w[61] = amd_bytealign (w[38], w[39], offset); + w[60] = amd_bytealign (w[37], w[38], offset); + w[59] = amd_bytealign (w[36], w[37], offset); + w[58] = amd_bytealign (w[35], w[36], offset); + w[57] = amd_bytealign (w[34], w[35], offset); + w[56] = amd_bytealign (w[33], w[34], offset); + w[55] = amd_bytealign (w[32], w[33], offset); + w[54] = amd_bytealign (w[31], w[32], offset); + w[53] = amd_bytealign (w[30], w[31], offset); + w[52] = amd_bytealign (w[29], w[30], offset); + w[51] = amd_bytealign (w[28], w[29], offset); + w[50] = amd_bytealign (w[27], w[28], offset); + w[49] = amd_bytealign (w[26], w[27], offset); + w[48] = amd_bytealign (w[25], w[26], offset); + w[47] = amd_bytealign (w[24], w[25], offset); + w[46] = amd_bytealign (w[23], w[24], offset); + w[45] = amd_bytealign (w[22], w[23], offset); + w[44] = amd_bytealign (w[21], w[22], offset); + w[43] = amd_bytealign (w[20], w[21], offset); + w[42] = amd_bytealign (w[19], w[20], offset); + w[41] = amd_bytealign (w[18], w[19], offset); + w[40] = amd_bytealign (w[17], w[18], offset); + w[39] = amd_bytealign (w[16], w[17], offset); + w[38] = amd_bytealign (w[15], w[16], offset); + w[37] = amd_bytealign (w[14], w[15], offset); + w[36] = amd_bytealign (w[13], w[14], offset); + w[35] = amd_bytealign (w[12], w[13], offset); + w[34] = amd_bytealign (w[11], w[12], offset); + w[33] = amd_bytealign (w[10], w[11], offset); + w[32] = amd_bytealign (w[ 9], w[10], offset); + w[31] = amd_bytealign (w[ 8], w[ 9], offset); + w[30] = amd_bytealign (w[ 7], w[ 8], offset); + w[29] = amd_bytealign (w[ 6], w[ 7], offset); + w[28] = amd_bytealign (w[ 5], w[ 6], offset); + w[27] = amd_bytealign (w[ 4], w[ 5], offset); + w[26] = amd_bytealign (w[ 3], w[ 4], offset); + w[25] = amd_bytealign (w[ 2], w[ 3], offset); + w[24] = amd_bytealign (w[ 1], w[ 2], offset); + w[23] = amd_bytealign (w[ 0], w[ 1], offset); + w[22] = amd_bytealign ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign (w[39], w[40], offset); + w[62] = amd_bytealign (w[38], w[39], offset); + w[61] = amd_bytealign (w[37], w[38], offset); + w[60] = amd_bytealign (w[36], w[37], offset); + w[59] = amd_bytealign (w[35], w[36], offset); + w[58] = amd_bytealign (w[34], w[35], offset); + w[57] = amd_bytealign (w[33], w[34], offset); + w[56] = amd_bytealign (w[32], w[33], offset); + w[55] = amd_bytealign (w[31], w[32], offset); + w[54] = amd_bytealign (w[30], w[31], offset); + w[53] = amd_bytealign (w[29], w[30], offset); + w[52] = amd_bytealign (w[28], w[29], offset); + w[51] = amd_bytealign (w[27], w[28], offset); + w[50] = amd_bytealign (w[26], w[27], offset); + w[49] = amd_bytealign (w[25], w[26], offset); + w[48] = amd_bytealign (w[24], w[25], offset); + w[47] = amd_bytealign (w[23], w[24], offset); + w[46] = amd_bytealign (w[22], w[23], offset); + w[45] = amd_bytealign (w[21], w[22], offset); + w[44] = amd_bytealign (w[20], w[21], offset); + w[43] = amd_bytealign (w[19], w[20], offset); + w[42] = amd_bytealign (w[18], w[19], offset); + w[41] = amd_bytealign (w[17], w[18], offset); + w[40] = amd_bytealign (w[16], w[17], offset); + w[39] = amd_bytealign (w[15], w[16], offset); + w[38] = amd_bytealign (w[14], w[15], offset); + w[37] = amd_bytealign (w[13], w[14], offset); + w[36] = amd_bytealign (w[12], w[13], offset); + w[35] = amd_bytealign (w[11], w[12], offset); + w[34] = amd_bytealign (w[10], w[11], offset); + w[33] = amd_bytealign (w[ 9], w[10], offset); + w[32] = amd_bytealign (w[ 8], w[ 9], offset); + w[31] = amd_bytealign (w[ 7], w[ 8], offset); + w[30] = amd_bytealign (w[ 6], w[ 7], offset); + w[29] = amd_bytealign (w[ 5], w[ 6], offset); + w[28] = amd_bytealign (w[ 4], w[ 5], offset); + w[27] = amd_bytealign (w[ 3], w[ 4], offset); + w[26] = amd_bytealign (w[ 2], w[ 3], offset); + w[25] = amd_bytealign (w[ 1], w[ 2], offset); + w[24] = amd_bytealign (w[ 0], w[ 1], offset); + w[23] = amd_bytealign ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign (w[38], w[39], offset); + w[62] = amd_bytealign (w[37], w[38], offset); + w[61] = amd_bytealign (w[36], w[37], offset); + w[60] = amd_bytealign (w[35], w[36], offset); + w[59] = amd_bytealign (w[34], w[35], offset); + w[58] = amd_bytealign (w[33], w[34], offset); + w[57] = amd_bytealign (w[32], w[33], offset); + w[56] = amd_bytealign (w[31], w[32], offset); + w[55] = amd_bytealign (w[30], w[31], offset); + w[54] = amd_bytealign (w[29], w[30], offset); + w[53] = amd_bytealign (w[28], w[29], offset); + w[52] = amd_bytealign (w[27], w[28], offset); + w[51] = amd_bytealign (w[26], w[27], offset); + w[50] = amd_bytealign (w[25], w[26], offset); + w[49] = amd_bytealign (w[24], w[25], offset); + w[48] = amd_bytealign (w[23], w[24], offset); + w[47] = amd_bytealign (w[22], w[23], offset); + w[46] = amd_bytealign (w[21], w[22], offset); + w[45] = amd_bytealign (w[20], w[21], offset); + w[44] = amd_bytealign (w[19], w[20], offset); + w[43] = amd_bytealign (w[18], w[19], offset); + w[42] = amd_bytealign (w[17], w[18], offset); + w[41] = amd_bytealign (w[16], w[17], offset); + w[40] = amd_bytealign (w[15], w[16], offset); + w[39] = amd_bytealign (w[14], w[15], offset); + w[38] = amd_bytealign (w[13], w[14], offset); + w[37] = amd_bytealign (w[12], w[13], offset); + w[36] = amd_bytealign (w[11], w[12], offset); + w[35] = amd_bytealign (w[10], w[11], offset); + w[34] = amd_bytealign (w[ 9], w[10], offset); + w[33] = amd_bytealign (w[ 8], w[ 9], offset); + w[32] = amd_bytealign (w[ 7], w[ 8], offset); + w[31] = amd_bytealign (w[ 6], w[ 7], offset); + w[30] = amd_bytealign (w[ 5], w[ 6], offset); + w[29] = amd_bytealign (w[ 4], w[ 5], offset); + w[28] = amd_bytealign (w[ 3], w[ 4], offset); + w[27] = amd_bytealign (w[ 2], w[ 3], offset); + w[26] = amd_bytealign (w[ 1], w[ 2], offset); + w[25] = amd_bytealign (w[ 0], w[ 1], offset); + w[24] = amd_bytealign ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign (w[37], w[38], offset); + w[62] = amd_bytealign (w[36], w[37], offset); + w[61] = amd_bytealign (w[35], w[36], offset); + w[60] = amd_bytealign (w[34], w[35], offset); + w[59] = amd_bytealign (w[33], w[34], offset); + w[58] = amd_bytealign (w[32], w[33], offset); + w[57] = amd_bytealign (w[31], w[32], offset); + w[56] = amd_bytealign (w[30], w[31], offset); + w[55] = amd_bytealign (w[29], w[30], offset); + w[54] = amd_bytealign (w[28], w[29], offset); + w[53] = amd_bytealign (w[27], w[28], offset); + w[52] = amd_bytealign (w[26], w[27], offset); + w[51] = amd_bytealign (w[25], w[26], offset); + w[50] = amd_bytealign (w[24], w[25], offset); + w[49] = amd_bytealign (w[23], w[24], offset); + w[48] = amd_bytealign (w[22], w[23], offset); + w[47] = amd_bytealign (w[21], w[22], offset); + w[46] = amd_bytealign (w[20], w[21], offset); + w[45] = amd_bytealign (w[19], w[20], offset); + w[44] = amd_bytealign (w[18], w[19], offset); + w[43] = amd_bytealign (w[17], w[18], offset); + w[42] = amd_bytealign (w[16], w[17], offset); + w[41] = amd_bytealign (w[15], w[16], offset); + w[40] = amd_bytealign (w[14], w[15], offset); + w[39] = amd_bytealign (w[13], w[14], offset); + w[38] = amd_bytealign (w[12], w[13], offset); + w[37] = amd_bytealign (w[11], w[12], offset); + w[36] = amd_bytealign (w[10], w[11], offset); + w[35] = amd_bytealign (w[ 9], w[10], offset); + w[34] = amd_bytealign (w[ 8], w[ 9], offset); + w[33] = amd_bytealign (w[ 7], w[ 8], offset); + w[32] = amd_bytealign (w[ 6], w[ 7], offset); + w[31] = amd_bytealign (w[ 5], w[ 6], offset); + w[30] = amd_bytealign (w[ 4], w[ 5], offset); + w[29] = amd_bytealign (w[ 3], w[ 4], offset); + w[28] = amd_bytealign (w[ 2], w[ 3], offset); + w[27] = amd_bytealign (w[ 1], w[ 2], offset); + w[26] = amd_bytealign (w[ 0], w[ 1], offset); + w[25] = amd_bytealign ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign (w[36], w[37], offset); + w[62] = amd_bytealign (w[35], w[36], offset); + w[61] = amd_bytealign (w[34], w[35], offset); + w[60] = amd_bytealign (w[33], w[34], offset); + w[59] = amd_bytealign (w[32], w[33], offset); + w[58] = amd_bytealign (w[31], w[32], offset); + w[57] = amd_bytealign (w[30], w[31], offset); + w[56] = amd_bytealign (w[29], w[30], offset); + w[55] = amd_bytealign (w[28], w[29], offset); + w[54] = amd_bytealign (w[27], w[28], offset); + w[53] = amd_bytealign (w[26], w[27], offset); + w[52] = amd_bytealign (w[25], w[26], offset); + w[51] = amd_bytealign (w[24], w[25], offset); + w[50] = amd_bytealign (w[23], w[24], offset); + w[49] = amd_bytealign (w[22], w[23], offset); + w[48] = amd_bytealign (w[21], w[22], offset); + w[47] = amd_bytealign (w[20], w[21], offset); + w[46] = amd_bytealign (w[19], w[20], offset); + w[45] = amd_bytealign (w[18], w[19], offset); + w[44] = amd_bytealign (w[17], w[18], offset); + w[43] = amd_bytealign (w[16], w[17], offset); + w[42] = amd_bytealign (w[15], w[16], offset); + w[41] = amd_bytealign (w[14], w[15], offset); + w[40] = amd_bytealign (w[13], w[14], offset); + w[39] = amd_bytealign (w[12], w[13], offset); + w[38] = amd_bytealign (w[11], w[12], offset); + w[37] = amd_bytealign (w[10], w[11], offset); + w[36] = amd_bytealign (w[ 9], w[10], offset); + w[35] = amd_bytealign (w[ 8], w[ 9], offset); + w[34] = amd_bytealign (w[ 7], w[ 8], offset); + w[33] = amd_bytealign (w[ 6], w[ 7], offset); + w[32] = amd_bytealign (w[ 5], w[ 6], offset); + w[31] = amd_bytealign (w[ 4], w[ 5], offset); + w[30] = amd_bytealign (w[ 3], w[ 4], offset); + w[29] = amd_bytealign (w[ 2], w[ 3], offset); + w[28] = amd_bytealign (w[ 1], w[ 2], offset); + w[27] = amd_bytealign (w[ 0], w[ 1], offset); + w[26] = amd_bytealign ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign (w[35], w[36], offset); + w[62] = amd_bytealign (w[34], w[35], offset); + w[61] = amd_bytealign (w[33], w[34], offset); + w[60] = amd_bytealign (w[32], w[33], offset); + w[59] = amd_bytealign (w[31], w[32], offset); + w[58] = amd_bytealign (w[30], w[31], offset); + w[57] = amd_bytealign (w[29], w[30], offset); + w[56] = amd_bytealign (w[28], w[29], offset); + w[55] = amd_bytealign (w[27], w[28], offset); + w[54] = amd_bytealign (w[26], w[27], offset); + w[53] = amd_bytealign (w[25], w[26], offset); + w[52] = amd_bytealign (w[24], w[25], offset); + w[51] = amd_bytealign (w[23], w[24], offset); + w[50] = amd_bytealign (w[22], w[23], offset); + w[49] = amd_bytealign (w[21], w[22], offset); + w[48] = amd_bytealign (w[20], w[21], offset); + w[47] = amd_bytealign (w[19], w[20], offset); + w[46] = amd_bytealign (w[18], w[19], offset); + w[45] = amd_bytealign (w[17], w[18], offset); + w[44] = amd_bytealign (w[16], w[17], offset); + w[43] = amd_bytealign (w[15], w[16], offset); + w[42] = amd_bytealign (w[14], w[15], offset); + w[41] = amd_bytealign (w[13], w[14], offset); + w[40] = amd_bytealign (w[12], w[13], offset); + w[39] = amd_bytealign (w[11], w[12], offset); + w[38] = amd_bytealign (w[10], w[11], offset); + w[37] = amd_bytealign (w[ 9], w[10], offset); + w[36] = amd_bytealign (w[ 8], w[ 9], offset); + w[35] = amd_bytealign (w[ 7], w[ 8], offset); + w[34] = amd_bytealign (w[ 6], w[ 7], offset); + w[33] = amd_bytealign (w[ 5], w[ 6], offset); + w[32] = amd_bytealign (w[ 4], w[ 5], offset); + w[31] = amd_bytealign (w[ 3], w[ 4], offset); + w[30] = amd_bytealign (w[ 2], w[ 3], offset); + w[29] = amd_bytealign (w[ 1], w[ 2], offset); + w[28] = amd_bytealign (w[ 0], w[ 1], offset); + w[27] = amd_bytealign ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign (w[34], w[35], offset); + w[62] = amd_bytealign (w[33], w[34], offset); + w[61] = amd_bytealign (w[32], w[33], offset); + w[60] = amd_bytealign (w[31], w[32], offset); + w[59] = amd_bytealign (w[30], w[31], offset); + w[58] = amd_bytealign (w[29], w[30], offset); + w[57] = amd_bytealign (w[28], w[29], offset); + w[56] = amd_bytealign (w[27], w[28], offset); + w[55] = amd_bytealign (w[26], w[27], offset); + w[54] = amd_bytealign (w[25], w[26], offset); + w[53] = amd_bytealign (w[24], w[25], offset); + w[52] = amd_bytealign (w[23], w[24], offset); + w[51] = amd_bytealign (w[22], w[23], offset); + w[50] = amd_bytealign (w[21], w[22], offset); + w[49] = amd_bytealign (w[20], w[21], offset); + w[48] = amd_bytealign (w[19], w[20], offset); + w[47] = amd_bytealign (w[18], w[19], offset); + w[46] = amd_bytealign (w[17], w[18], offset); + w[45] = amd_bytealign (w[16], w[17], offset); + w[44] = amd_bytealign (w[15], w[16], offset); + w[43] = amd_bytealign (w[14], w[15], offset); + w[42] = amd_bytealign (w[13], w[14], offset); + w[41] = amd_bytealign (w[12], w[13], offset); + w[40] = amd_bytealign (w[11], w[12], offset); + w[39] = amd_bytealign (w[10], w[11], offset); + w[38] = amd_bytealign (w[ 9], w[10], offset); + w[37] = amd_bytealign (w[ 8], w[ 9], offset); + w[36] = amd_bytealign (w[ 7], w[ 8], offset); + w[35] = amd_bytealign (w[ 6], w[ 7], offset); + w[34] = amd_bytealign (w[ 5], w[ 6], offset); + w[33] = amd_bytealign (w[ 4], w[ 5], offset); + w[32] = amd_bytealign (w[ 3], w[ 4], offset); + w[31] = amd_bytealign (w[ 2], w[ 3], offset); + w[30] = amd_bytealign (w[ 1], w[ 2], offset); + w[29] = amd_bytealign (w[ 0], w[ 1], offset); + w[28] = amd_bytealign ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = amd_bytealign (w[33], w[34], offset); + w[62] = amd_bytealign (w[32], w[33], offset); + w[61] = amd_bytealign (w[31], w[32], offset); + w[60] = amd_bytealign (w[30], w[31], offset); + w[59] = amd_bytealign (w[29], w[30], offset); + w[58] = amd_bytealign (w[28], w[29], offset); + w[57] = amd_bytealign (w[27], w[28], offset); + w[56] = amd_bytealign (w[26], w[27], offset); + w[55] = amd_bytealign (w[25], w[26], offset); + w[54] = amd_bytealign (w[24], w[25], offset); + w[53] = amd_bytealign (w[23], w[24], offset); + w[52] = amd_bytealign (w[22], w[23], offset); + w[51] = amd_bytealign (w[21], w[22], offset); + w[50] = amd_bytealign (w[20], w[21], offset); + w[49] = amd_bytealign (w[19], w[20], offset); + w[48] = amd_bytealign (w[18], w[19], offset); + w[47] = amd_bytealign (w[17], w[18], offset); + w[46] = amd_bytealign (w[16], w[17], offset); + w[45] = amd_bytealign (w[15], w[16], offset); + w[44] = amd_bytealign (w[14], w[15], offset); + w[43] = amd_bytealign (w[13], w[14], offset); + w[42] = amd_bytealign (w[12], w[13], offset); + w[41] = amd_bytealign (w[11], w[12], offset); + w[40] = amd_bytealign (w[10], w[11], offset); + w[39] = amd_bytealign (w[ 9], w[10], offset); + w[38] = amd_bytealign (w[ 8], w[ 9], offset); + w[37] = amd_bytealign (w[ 7], w[ 8], offset); + w[36] = amd_bytealign (w[ 6], w[ 7], offset); + w[35] = amd_bytealign (w[ 5], w[ 6], offset); + w[34] = amd_bytealign (w[ 4], w[ 5], offset); + w[33] = amd_bytealign (w[ 3], w[ 4], offset); + w[32] = amd_bytealign (w[ 2], w[ 3], offset); + w[31] = amd_bytealign (w[ 1], w[ 2], offset); + w[30] = amd_bytealign (w[ 0], w[ 1], offset); + w[29] = amd_bytealign ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign (w[32], w[33], offset); + w[62] = amd_bytealign (w[31], w[32], offset); + w[61] = amd_bytealign (w[30], w[31], offset); + w[60] = amd_bytealign (w[29], w[30], offset); + w[59] = amd_bytealign (w[28], w[29], offset); + w[58] = amd_bytealign (w[27], w[28], offset); + w[57] = amd_bytealign (w[26], w[27], offset); + w[56] = amd_bytealign (w[25], w[26], offset); + w[55] = amd_bytealign (w[24], w[25], offset); + w[54] = amd_bytealign (w[23], w[24], offset); + w[53] = amd_bytealign (w[22], w[23], offset); + w[52] = amd_bytealign (w[21], w[22], offset); + w[51] = amd_bytealign (w[20], w[21], offset); + w[50] = amd_bytealign (w[19], w[20], offset); + w[49] = amd_bytealign (w[18], w[19], offset); + w[48] = amd_bytealign (w[17], w[18], offset); + w[47] = amd_bytealign (w[16], w[17], offset); + w[46] = amd_bytealign (w[15], w[16], offset); + w[45] = amd_bytealign (w[14], w[15], offset); + w[44] = amd_bytealign (w[13], w[14], offset); + w[43] = amd_bytealign (w[12], w[13], offset); + w[42] = amd_bytealign (w[11], w[12], offset); + w[41] = amd_bytealign (w[10], w[11], offset); + w[40] = amd_bytealign (w[ 9], w[10], offset); + w[39] = amd_bytealign (w[ 8], w[ 9], offset); + w[38] = amd_bytealign (w[ 7], w[ 8], offset); + w[37] = amd_bytealign (w[ 6], w[ 7], offset); + w[36] = amd_bytealign (w[ 5], w[ 6], offset); + w[35] = amd_bytealign (w[ 4], w[ 5], offset); + w[34] = amd_bytealign (w[ 3], w[ 4], offset); + w[33] = amd_bytealign (w[ 2], w[ 3], offset); + w[32] = amd_bytealign (w[ 1], w[ 2], offset); + w[31] = amd_bytealign (w[ 0], w[ 1], offset); + w[30] = amd_bytealign ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign (w[31], w[32], offset); + w[62] = amd_bytealign (w[30], w[31], offset); + w[61] = amd_bytealign (w[29], w[30], offset); + w[60] = amd_bytealign (w[28], w[29], offset); + w[59] = amd_bytealign (w[27], w[28], offset); + w[58] = amd_bytealign (w[26], w[27], offset); + w[57] = amd_bytealign (w[25], w[26], offset); + w[56] = amd_bytealign (w[24], w[25], offset); + w[55] = amd_bytealign (w[23], w[24], offset); + w[54] = amd_bytealign (w[22], w[23], offset); + w[53] = amd_bytealign (w[21], w[22], offset); + w[52] = amd_bytealign (w[20], w[21], offset); + w[51] = amd_bytealign (w[19], w[20], offset); + w[50] = amd_bytealign (w[18], w[19], offset); + w[49] = amd_bytealign (w[17], w[18], offset); + w[48] = amd_bytealign (w[16], w[17], offset); + w[47] = amd_bytealign (w[15], w[16], offset); + w[46] = amd_bytealign (w[14], w[15], offset); + w[45] = amd_bytealign (w[13], w[14], offset); + w[44] = amd_bytealign (w[12], w[13], offset); + w[43] = amd_bytealign (w[11], w[12], offset); + w[42] = amd_bytealign (w[10], w[11], offset); + w[41] = amd_bytealign (w[ 9], w[10], offset); + w[40] = amd_bytealign (w[ 8], w[ 9], offset); + w[39] = amd_bytealign (w[ 7], w[ 8], offset); + w[38] = amd_bytealign (w[ 6], w[ 7], offset); + w[37] = amd_bytealign (w[ 5], w[ 6], offset); + w[36] = amd_bytealign (w[ 4], w[ 5], offset); + w[35] = amd_bytealign (w[ 3], w[ 4], offset); + w[34] = amd_bytealign (w[ 2], w[ 3], offset); + w[33] = amd_bytealign (w[ 1], w[ 2], offset); + w[32] = amd_bytealign (w[ 0], w[ 1], offset); + w[31] = amd_bytealign ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign (w[30], w[31], offset); + w[62] = amd_bytealign (w[29], w[30], offset); + w[61] = amd_bytealign (w[28], w[29], offset); + w[60] = amd_bytealign (w[27], w[28], offset); + w[59] = amd_bytealign (w[26], w[27], offset); + w[58] = amd_bytealign (w[25], w[26], offset); + w[57] = amd_bytealign (w[24], w[25], offset); + w[56] = amd_bytealign (w[23], w[24], offset); + w[55] = amd_bytealign (w[22], w[23], offset); + w[54] = amd_bytealign (w[21], w[22], offset); + w[53] = amd_bytealign (w[20], w[21], offset); + w[52] = amd_bytealign (w[19], w[20], offset); + w[51] = amd_bytealign (w[18], w[19], offset); + w[50] = amd_bytealign (w[17], w[18], offset); + w[49] = amd_bytealign (w[16], w[17], offset); + w[48] = amd_bytealign (w[15], w[16], offset); + w[47] = amd_bytealign (w[14], w[15], offset); + w[46] = amd_bytealign (w[13], w[14], offset); + w[45] = amd_bytealign (w[12], w[13], offset); + w[44] = amd_bytealign (w[11], w[12], offset); + w[43] = amd_bytealign (w[10], w[11], offset); + w[42] = amd_bytealign (w[ 9], w[10], offset); + w[41] = amd_bytealign (w[ 8], w[ 9], offset); + w[40] = amd_bytealign (w[ 7], w[ 8], offset); + w[39] = amd_bytealign (w[ 6], w[ 7], offset); + w[38] = amd_bytealign (w[ 5], w[ 6], offset); + w[37] = amd_bytealign (w[ 4], w[ 5], offset); + w[36] = amd_bytealign (w[ 3], w[ 4], offset); + w[35] = amd_bytealign (w[ 2], w[ 3], offset); + w[34] = amd_bytealign (w[ 1], w[ 2], offset); + w[33] = amd_bytealign (w[ 0], w[ 1], offset); + w[32] = amd_bytealign ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = amd_bytealign (w[29], w[30], offset); + w[62] = amd_bytealign (w[28], w[29], offset); + w[61] = amd_bytealign (w[27], w[28], offset); + w[60] = amd_bytealign (w[26], w[27], offset); + w[59] = amd_bytealign (w[25], w[26], offset); + w[58] = amd_bytealign (w[24], w[25], offset); + w[57] = amd_bytealign (w[23], w[24], offset); + w[56] = amd_bytealign (w[22], w[23], offset); + w[55] = amd_bytealign (w[21], w[22], offset); + w[54] = amd_bytealign (w[20], w[21], offset); + w[53] = amd_bytealign (w[19], w[20], offset); + w[52] = amd_bytealign (w[18], w[19], offset); + w[51] = amd_bytealign (w[17], w[18], offset); + w[50] = amd_bytealign (w[16], w[17], offset); + w[49] = amd_bytealign (w[15], w[16], offset); + w[48] = amd_bytealign (w[14], w[15], offset); + w[47] = amd_bytealign (w[13], w[14], offset); + w[46] = amd_bytealign (w[12], w[13], offset); + w[45] = amd_bytealign (w[11], w[12], offset); + w[44] = amd_bytealign (w[10], w[11], offset); + w[43] = amd_bytealign (w[ 9], w[10], offset); + w[42] = amd_bytealign (w[ 8], w[ 9], offset); + w[41] = amd_bytealign (w[ 7], w[ 8], offset); + w[40] = amd_bytealign (w[ 6], w[ 7], offset); + w[39] = amd_bytealign (w[ 5], w[ 6], offset); + w[38] = amd_bytealign (w[ 4], w[ 5], offset); + w[37] = amd_bytealign (w[ 3], w[ 4], offset); + w[36] = amd_bytealign (w[ 2], w[ 3], offset); + w[35] = amd_bytealign (w[ 1], w[ 2], offset); + w[34] = amd_bytealign (w[ 0], w[ 1], offset); + w[33] = amd_bytealign ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = amd_bytealign (w[28], w[29], offset); + w[62] = amd_bytealign (w[27], w[28], offset); + w[61] = amd_bytealign (w[26], w[27], offset); + w[60] = amd_bytealign (w[25], w[26], offset); + w[59] = amd_bytealign (w[24], w[25], offset); + w[58] = amd_bytealign (w[23], w[24], offset); + w[57] = amd_bytealign (w[22], w[23], offset); + w[56] = amd_bytealign (w[21], w[22], offset); + w[55] = amd_bytealign (w[20], w[21], offset); + w[54] = amd_bytealign (w[19], w[20], offset); + w[53] = amd_bytealign (w[18], w[19], offset); + w[52] = amd_bytealign (w[17], w[18], offset); + w[51] = amd_bytealign (w[16], w[17], offset); + w[50] = amd_bytealign (w[15], w[16], offset); + w[49] = amd_bytealign (w[14], w[15], offset); + w[48] = amd_bytealign (w[13], w[14], offset); + w[47] = amd_bytealign (w[12], w[13], offset); + w[46] = amd_bytealign (w[11], w[12], offset); + w[45] = amd_bytealign (w[10], w[11], offset); + w[44] = amd_bytealign (w[ 9], w[10], offset); + w[43] = amd_bytealign (w[ 8], w[ 9], offset); + w[42] = amd_bytealign (w[ 7], w[ 8], offset); + w[41] = amd_bytealign (w[ 6], w[ 7], offset); + w[40] = amd_bytealign (w[ 5], w[ 6], offset); + w[39] = amd_bytealign (w[ 4], w[ 5], offset); + w[38] = amd_bytealign (w[ 3], w[ 4], offset); + w[37] = amd_bytealign (w[ 2], w[ 3], offset); + w[36] = amd_bytealign (w[ 1], w[ 2], offset); + w[35] = amd_bytealign (w[ 0], w[ 1], offset); + w[34] = amd_bytealign ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign (w[27], w[28], offset); + w[62] = amd_bytealign (w[26], w[27], offset); + w[61] = amd_bytealign (w[25], w[26], offset); + w[60] = amd_bytealign (w[24], w[25], offset); + w[59] = amd_bytealign (w[23], w[24], offset); + w[58] = amd_bytealign (w[22], w[23], offset); + w[57] = amd_bytealign (w[21], w[22], offset); + w[56] = amd_bytealign (w[20], w[21], offset); + w[55] = amd_bytealign (w[19], w[20], offset); + w[54] = amd_bytealign (w[18], w[19], offset); + w[53] = amd_bytealign (w[17], w[18], offset); + w[52] = amd_bytealign (w[16], w[17], offset); + w[51] = amd_bytealign (w[15], w[16], offset); + w[50] = amd_bytealign (w[14], w[15], offset); + w[49] = amd_bytealign (w[13], w[14], offset); + w[48] = amd_bytealign (w[12], w[13], offset); + w[47] = amd_bytealign (w[11], w[12], offset); + w[46] = amd_bytealign (w[10], w[11], offset); + w[45] = amd_bytealign (w[ 9], w[10], offset); + w[44] = amd_bytealign (w[ 8], w[ 9], offset); + w[43] = amd_bytealign (w[ 7], w[ 8], offset); + w[42] = amd_bytealign (w[ 6], w[ 7], offset); + w[41] = amd_bytealign (w[ 5], w[ 6], offset); + w[40] = amd_bytealign (w[ 4], w[ 5], offset); + w[39] = amd_bytealign (w[ 3], w[ 4], offset); + w[38] = amd_bytealign (w[ 2], w[ 3], offset); + w[37] = amd_bytealign (w[ 1], w[ 2], offset); + w[36] = amd_bytealign (w[ 0], w[ 1], offset); + w[35] = amd_bytealign ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign (w[26], w[27], offset); + w[62] = amd_bytealign (w[25], w[26], offset); + w[61] = amd_bytealign (w[24], w[25], offset); + w[60] = amd_bytealign (w[23], w[24], offset); + w[59] = amd_bytealign (w[22], w[23], offset); + w[58] = amd_bytealign (w[21], w[22], offset); + w[57] = amd_bytealign (w[20], w[21], offset); + w[56] = amd_bytealign (w[19], w[20], offset); + w[55] = amd_bytealign (w[18], w[19], offset); + w[54] = amd_bytealign (w[17], w[18], offset); + w[53] = amd_bytealign (w[16], w[17], offset); + w[52] = amd_bytealign (w[15], w[16], offset); + w[51] = amd_bytealign (w[14], w[15], offset); + w[50] = amd_bytealign (w[13], w[14], offset); + w[49] = amd_bytealign (w[12], w[13], offset); + w[48] = amd_bytealign (w[11], w[12], offset); + w[47] = amd_bytealign (w[10], w[11], offset); + w[46] = amd_bytealign (w[ 9], w[10], offset); + w[45] = amd_bytealign (w[ 8], w[ 9], offset); + w[44] = amd_bytealign (w[ 7], w[ 8], offset); + w[43] = amd_bytealign (w[ 6], w[ 7], offset); + w[42] = amd_bytealign (w[ 5], w[ 6], offset); + w[41] = amd_bytealign (w[ 4], w[ 5], offset); + w[40] = amd_bytealign (w[ 3], w[ 4], offset); + w[39] = amd_bytealign (w[ 2], w[ 3], offset); + w[38] = amd_bytealign (w[ 1], w[ 2], offset); + w[37] = amd_bytealign (w[ 0], w[ 1], offset); + w[36] = amd_bytealign ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign (w[25], w[26], offset); + w[62] = amd_bytealign (w[24], w[25], offset); + w[61] = amd_bytealign (w[23], w[24], offset); + w[60] = amd_bytealign (w[22], w[23], offset); + w[59] = amd_bytealign (w[21], w[22], offset); + w[58] = amd_bytealign (w[20], w[21], offset); + w[57] = amd_bytealign (w[19], w[20], offset); + w[56] = amd_bytealign (w[18], w[19], offset); + w[55] = amd_bytealign (w[17], w[18], offset); + w[54] = amd_bytealign (w[16], w[17], offset); + w[53] = amd_bytealign (w[15], w[16], offset); + w[52] = amd_bytealign (w[14], w[15], offset); + w[51] = amd_bytealign (w[13], w[14], offset); + w[50] = amd_bytealign (w[12], w[13], offset); + w[49] = amd_bytealign (w[11], w[12], offset); + w[48] = amd_bytealign (w[10], w[11], offset); + w[47] = amd_bytealign (w[ 9], w[10], offset); + w[46] = amd_bytealign (w[ 8], w[ 9], offset); + w[45] = amd_bytealign (w[ 7], w[ 8], offset); + w[44] = amd_bytealign (w[ 6], w[ 7], offset); + w[43] = amd_bytealign (w[ 5], w[ 6], offset); + w[42] = amd_bytealign (w[ 4], w[ 5], offset); + w[41] = amd_bytealign (w[ 3], w[ 4], offset); + w[40] = amd_bytealign (w[ 2], w[ 3], offset); + w[39] = amd_bytealign (w[ 1], w[ 2], offset); + w[38] = amd_bytealign (w[ 0], w[ 1], offset); + w[37] = amd_bytealign ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign (w[24], w[25], offset); + w[62] = amd_bytealign (w[23], w[24], offset); + w[61] = amd_bytealign (w[22], w[23], offset); + w[60] = amd_bytealign (w[21], w[22], offset); + w[59] = amd_bytealign (w[20], w[21], offset); + w[58] = amd_bytealign (w[19], w[20], offset); + w[57] = amd_bytealign (w[18], w[19], offset); + w[56] = amd_bytealign (w[17], w[18], offset); + w[55] = amd_bytealign (w[16], w[17], offset); + w[54] = amd_bytealign (w[15], w[16], offset); + w[53] = amd_bytealign (w[14], w[15], offset); + w[52] = amd_bytealign (w[13], w[14], offset); + w[51] = amd_bytealign (w[12], w[13], offset); + w[50] = amd_bytealign (w[11], w[12], offset); + w[49] = amd_bytealign (w[10], w[11], offset); + w[48] = amd_bytealign (w[ 9], w[10], offset); + w[47] = amd_bytealign (w[ 8], w[ 9], offset); + w[46] = amd_bytealign (w[ 7], w[ 8], offset); + w[45] = amd_bytealign (w[ 6], w[ 7], offset); + w[44] = amd_bytealign (w[ 5], w[ 6], offset); + w[43] = amd_bytealign (w[ 4], w[ 5], offset); + w[42] = amd_bytealign (w[ 3], w[ 4], offset); + w[41] = amd_bytealign (w[ 2], w[ 3], offset); + w[40] = amd_bytealign (w[ 1], w[ 2], offset); + w[39] = amd_bytealign (w[ 0], w[ 1], offset); + w[38] = amd_bytealign ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign (w[23], w[24], offset); + w[62] = amd_bytealign (w[22], w[23], offset); + w[61] = amd_bytealign (w[21], w[22], offset); + w[60] = amd_bytealign (w[20], w[21], offset); + w[59] = amd_bytealign (w[19], w[20], offset); + w[58] = amd_bytealign (w[18], w[19], offset); + w[57] = amd_bytealign (w[17], w[18], offset); + w[56] = amd_bytealign (w[16], w[17], offset); + w[55] = amd_bytealign (w[15], w[16], offset); + w[54] = amd_bytealign (w[14], w[15], offset); + w[53] = amd_bytealign (w[13], w[14], offset); + w[52] = amd_bytealign (w[12], w[13], offset); + w[51] = amd_bytealign (w[11], w[12], offset); + w[50] = amd_bytealign (w[10], w[11], offset); + w[49] = amd_bytealign (w[ 9], w[10], offset); + w[48] = amd_bytealign (w[ 8], w[ 9], offset); + w[47] = amd_bytealign (w[ 7], w[ 8], offset); + w[46] = amd_bytealign (w[ 6], w[ 7], offset); + w[45] = amd_bytealign (w[ 5], w[ 6], offset); + w[44] = amd_bytealign (w[ 4], w[ 5], offset); + w[43] = amd_bytealign (w[ 3], w[ 4], offset); + w[42] = amd_bytealign (w[ 2], w[ 3], offset); + w[41] = amd_bytealign (w[ 1], w[ 2], offset); + w[40] = amd_bytealign (w[ 0], w[ 1], offset); + w[39] = amd_bytealign ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign (w[22], w[23], offset); + w[62] = amd_bytealign (w[21], w[22], offset); + w[61] = amd_bytealign (w[20], w[21], offset); + w[60] = amd_bytealign (w[19], w[20], offset); + w[59] = amd_bytealign (w[18], w[19], offset); + w[58] = amd_bytealign (w[17], w[18], offset); + w[57] = amd_bytealign (w[16], w[17], offset); + w[56] = amd_bytealign (w[15], w[16], offset); + w[55] = amd_bytealign (w[14], w[15], offset); + w[54] = amd_bytealign (w[13], w[14], offset); + w[53] = amd_bytealign (w[12], w[13], offset); + w[52] = amd_bytealign (w[11], w[12], offset); + w[51] = amd_bytealign (w[10], w[11], offset); + w[50] = amd_bytealign (w[ 9], w[10], offset); + w[49] = amd_bytealign (w[ 8], w[ 9], offset); + w[48] = amd_bytealign (w[ 7], w[ 8], offset); + w[47] = amd_bytealign (w[ 6], w[ 7], offset); + w[46] = amd_bytealign (w[ 5], w[ 6], offset); + w[45] = amd_bytealign (w[ 4], w[ 5], offset); + w[44] = amd_bytealign (w[ 3], w[ 4], offset); + w[43] = amd_bytealign (w[ 2], w[ 3], offset); + w[42] = amd_bytealign (w[ 1], w[ 2], offset); + w[41] = amd_bytealign (w[ 0], w[ 1], offset); + w[40] = amd_bytealign ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign (w[21], w[22], offset); + w[62] = amd_bytealign (w[20], w[21], offset); + w[61] = amd_bytealign (w[19], w[20], offset); + w[60] = amd_bytealign (w[18], w[19], offset); + w[59] = amd_bytealign (w[17], w[18], offset); + w[58] = amd_bytealign (w[16], w[17], offset); + w[57] = amd_bytealign (w[15], w[16], offset); + w[56] = amd_bytealign (w[14], w[15], offset); + w[55] = amd_bytealign (w[13], w[14], offset); + w[54] = amd_bytealign (w[12], w[13], offset); + w[53] = amd_bytealign (w[11], w[12], offset); + w[52] = amd_bytealign (w[10], w[11], offset); + w[51] = amd_bytealign (w[ 9], w[10], offset); + w[50] = amd_bytealign (w[ 8], w[ 9], offset); + w[49] = amd_bytealign (w[ 7], w[ 8], offset); + w[48] = amd_bytealign (w[ 6], w[ 7], offset); + w[47] = amd_bytealign (w[ 5], w[ 6], offset); + w[46] = amd_bytealign (w[ 4], w[ 5], offset); + w[45] = amd_bytealign (w[ 3], w[ 4], offset); + w[44] = amd_bytealign (w[ 2], w[ 3], offset); + w[43] = amd_bytealign (w[ 1], w[ 2], offset); + w[42] = amd_bytealign (w[ 0], w[ 1], offset); + w[41] = amd_bytealign ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign (w[20], w[21], offset); + w[62] = amd_bytealign (w[19], w[20], offset); + w[61] = amd_bytealign (w[18], w[19], offset); + w[60] = amd_bytealign (w[17], w[18], offset); + w[59] = amd_bytealign (w[16], w[17], offset); + w[58] = amd_bytealign (w[15], w[16], offset); + w[57] = amd_bytealign (w[14], w[15], offset); + w[56] = amd_bytealign (w[13], w[14], offset); + w[55] = amd_bytealign (w[12], w[13], offset); + w[54] = amd_bytealign (w[11], w[12], offset); + w[53] = amd_bytealign (w[10], w[11], offset); + w[52] = amd_bytealign (w[ 9], w[10], offset); + w[51] = amd_bytealign (w[ 8], w[ 9], offset); + w[50] = amd_bytealign (w[ 7], w[ 8], offset); + w[49] = amd_bytealign (w[ 6], w[ 7], offset); + w[48] = amd_bytealign (w[ 5], w[ 6], offset); + w[47] = amd_bytealign (w[ 4], w[ 5], offset); + w[46] = amd_bytealign (w[ 3], w[ 4], offset); + w[45] = amd_bytealign (w[ 2], w[ 3], offset); + w[44] = amd_bytealign (w[ 1], w[ 2], offset); + w[43] = amd_bytealign (w[ 0], w[ 1], offset); + w[42] = amd_bytealign ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign (w[19], w[20], offset); + w[62] = amd_bytealign (w[18], w[19], offset); + w[61] = amd_bytealign (w[17], w[18], offset); + w[60] = amd_bytealign (w[16], w[17], offset); + w[59] = amd_bytealign (w[15], w[16], offset); + w[58] = amd_bytealign (w[14], w[15], offset); + w[57] = amd_bytealign (w[13], w[14], offset); + w[56] = amd_bytealign (w[12], w[13], offset); + w[55] = amd_bytealign (w[11], w[12], offset); + w[54] = amd_bytealign (w[10], w[11], offset); + w[53] = amd_bytealign (w[ 9], w[10], offset); + w[52] = amd_bytealign (w[ 8], w[ 9], offset); + w[51] = amd_bytealign (w[ 7], w[ 8], offset); + w[50] = amd_bytealign (w[ 6], w[ 7], offset); + w[49] = amd_bytealign (w[ 5], w[ 6], offset); + w[48] = amd_bytealign (w[ 4], w[ 5], offset); + w[47] = amd_bytealign (w[ 3], w[ 4], offset); + w[46] = amd_bytealign (w[ 2], w[ 3], offset); + w[45] = amd_bytealign (w[ 1], w[ 2], offset); + w[44] = amd_bytealign (w[ 0], w[ 1], offset); + w[43] = amd_bytealign ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = amd_bytealign (w[18], w[19], offset); + w[62] = amd_bytealign (w[17], w[18], offset); + w[61] = amd_bytealign (w[16], w[17], offset); + w[60] = amd_bytealign (w[15], w[16], offset); + w[59] = amd_bytealign (w[14], w[15], offset); + w[58] = amd_bytealign (w[13], w[14], offset); + w[57] = amd_bytealign (w[12], w[13], offset); + w[56] = amd_bytealign (w[11], w[12], offset); + w[55] = amd_bytealign (w[10], w[11], offset); + w[54] = amd_bytealign (w[ 9], w[10], offset); + w[53] = amd_bytealign (w[ 8], w[ 9], offset); + w[52] = amd_bytealign (w[ 7], w[ 8], offset); + w[51] = amd_bytealign (w[ 6], w[ 7], offset); + w[50] = amd_bytealign (w[ 5], w[ 6], offset); + w[49] = amd_bytealign (w[ 4], w[ 5], offset); + w[48] = amd_bytealign (w[ 3], w[ 4], offset); + w[47] = amd_bytealign (w[ 2], w[ 3], offset); + w[46] = amd_bytealign (w[ 1], w[ 2], offset); + w[45] = amd_bytealign (w[ 0], w[ 1], offset); + w[44] = amd_bytealign ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign (w[17], w[18], offset); + w[62] = amd_bytealign (w[16], w[17], offset); + w[61] = amd_bytealign (w[15], w[16], offset); + w[60] = amd_bytealign (w[14], w[15], offset); + w[59] = amd_bytealign (w[13], w[14], offset); + w[58] = amd_bytealign (w[12], w[13], offset); + w[57] = amd_bytealign (w[11], w[12], offset); + w[56] = amd_bytealign (w[10], w[11], offset); + w[55] = amd_bytealign (w[ 9], w[10], offset); + w[54] = amd_bytealign (w[ 8], w[ 9], offset); + w[53] = amd_bytealign (w[ 7], w[ 8], offset); + w[52] = amd_bytealign (w[ 6], w[ 7], offset); + w[51] = amd_bytealign (w[ 5], w[ 6], offset); + w[50] = amd_bytealign (w[ 4], w[ 5], offset); + w[49] = amd_bytealign (w[ 3], w[ 4], offset); + w[48] = amd_bytealign (w[ 2], w[ 3], offset); + w[47] = amd_bytealign (w[ 1], w[ 2], offset); + w[46] = amd_bytealign (w[ 0], w[ 1], offset); + w[45] = amd_bytealign ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign (w[16], w[17], offset); + w[62] = amd_bytealign (w[15], w[16], offset); + w[61] = amd_bytealign (w[14], w[15], offset); + w[60] = amd_bytealign (w[13], w[14], offset); + w[59] = amd_bytealign (w[12], w[13], offset); + w[58] = amd_bytealign (w[11], w[12], offset); + w[57] = amd_bytealign (w[10], w[11], offset); + w[56] = amd_bytealign (w[ 9], w[10], offset); + w[55] = amd_bytealign (w[ 8], w[ 9], offset); + w[54] = amd_bytealign (w[ 7], w[ 8], offset); + w[53] = amd_bytealign (w[ 6], w[ 7], offset); + w[52] = amd_bytealign (w[ 5], w[ 6], offset); + w[51] = amd_bytealign (w[ 4], w[ 5], offset); + w[50] = amd_bytealign (w[ 3], w[ 4], offset); + w[49] = amd_bytealign (w[ 2], w[ 3], offset); + w[48] = amd_bytealign (w[ 1], w[ 2], offset); + w[47] = amd_bytealign (w[ 0], w[ 1], offset); + w[46] = amd_bytealign ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign (w[15], w[16], offset); + w[62] = amd_bytealign (w[14], w[15], offset); + w[61] = amd_bytealign (w[13], w[14], offset); + w[60] = amd_bytealign (w[12], w[13], offset); + w[59] = amd_bytealign (w[11], w[12], offset); + w[58] = amd_bytealign (w[10], w[11], offset); + w[57] = amd_bytealign (w[ 9], w[10], offset); + w[56] = amd_bytealign (w[ 8], w[ 9], offset); + w[55] = amd_bytealign (w[ 7], w[ 8], offset); + w[54] = amd_bytealign (w[ 6], w[ 7], offset); + w[53] = amd_bytealign (w[ 5], w[ 6], offset); + w[52] = amd_bytealign (w[ 4], w[ 5], offset); + w[51] = amd_bytealign (w[ 3], w[ 4], offset); + w[50] = amd_bytealign (w[ 2], w[ 3], offset); + w[49] = amd_bytealign (w[ 1], w[ 2], offset); + w[48] = amd_bytealign (w[ 0], w[ 1], offset); + w[47] = amd_bytealign ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign (w[14], w[15], offset); + w[62] = amd_bytealign (w[13], w[14], offset); + w[61] = amd_bytealign (w[12], w[13], offset); + w[60] = amd_bytealign (w[11], w[12], offset); + w[59] = amd_bytealign (w[10], w[11], offset); + w[58] = amd_bytealign (w[ 9], w[10], offset); + w[57] = amd_bytealign (w[ 8], w[ 9], offset); + w[56] = amd_bytealign (w[ 7], w[ 8], offset); + w[55] = amd_bytealign (w[ 6], w[ 7], offset); + w[54] = amd_bytealign (w[ 5], w[ 6], offset); + w[53] = amd_bytealign (w[ 4], w[ 5], offset); + w[52] = amd_bytealign (w[ 3], w[ 4], offset); + w[51] = amd_bytealign (w[ 2], w[ 3], offset); + w[50] = amd_bytealign (w[ 1], w[ 2], offset); + w[49] = amd_bytealign (w[ 0], w[ 1], offset); + w[48] = amd_bytealign ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign (w[13], w[14], offset); + w[62] = amd_bytealign (w[12], w[13], offset); + w[61] = amd_bytealign (w[11], w[12], offset); + w[60] = amd_bytealign (w[10], w[11], offset); + w[59] = amd_bytealign (w[ 9], w[10], offset); + w[58] = amd_bytealign (w[ 8], w[ 9], offset); + w[57] = amd_bytealign (w[ 7], w[ 8], offset); + w[56] = amd_bytealign (w[ 6], w[ 7], offset); + w[55] = amd_bytealign (w[ 5], w[ 6], offset); + w[54] = amd_bytealign (w[ 4], w[ 5], offset); + w[53] = amd_bytealign (w[ 3], w[ 4], offset); + w[52] = amd_bytealign (w[ 2], w[ 3], offset); + w[51] = amd_bytealign (w[ 1], w[ 2], offset); + w[50] = amd_bytealign (w[ 0], w[ 1], offset); + w[49] = amd_bytealign ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign (w[12], w[13], offset); + w[62] = amd_bytealign (w[11], w[12], offset); + w[61] = amd_bytealign (w[10], w[11], offset); + w[60] = amd_bytealign (w[ 9], w[10], offset); + w[59] = amd_bytealign (w[ 8], w[ 9], offset); + w[58] = amd_bytealign (w[ 7], w[ 8], offset); + w[57] = amd_bytealign (w[ 6], w[ 7], offset); + w[56] = amd_bytealign (w[ 5], w[ 6], offset); + w[55] = amd_bytealign (w[ 4], w[ 5], offset); + w[54] = amd_bytealign (w[ 3], w[ 4], offset); + w[53] = amd_bytealign (w[ 2], w[ 3], offset); + w[52] = amd_bytealign (w[ 1], w[ 2], offset); + w[51] = amd_bytealign (w[ 0], w[ 1], offset); + w[50] = amd_bytealign ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = amd_bytealign (w[11], w[12], offset); + w[62] = amd_bytealign (w[10], w[11], offset); + w[61] = amd_bytealign (w[ 9], w[10], offset); + w[60] = amd_bytealign (w[ 8], w[ 9], offset); + w[59] = amd_bytealign (w[ 7], w[ 8], offset); + w[58] = amd_bytealign (w[ 6], w[ 7], offset); + w[57] = amd_bytealign (w[ 5], w[ 6], offset); + w[56] = amd_bytealign (w[ 4], w[ 5], offset); + w[55] = amd_bytealign (w[ 3], w[ 4], offset); + w[54] = amd_bytealign (w[ 2], w[ 3], offset); + w[53] = amd_bytealign (w[ 1], w[ 2], offset); + w[52] = amd_bytealign (w[ 0], w[ 1], offset); + w[51] = amd_bytealign ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign (w[10], w[11], offset); + w[62] = amd_bytealign (w[ 9], w[10], offset); + w[61] = amd_bytealign (w[ 8], w[ 9], offset); + w[60] = amd_bytealign (w[ 7], w[ 8], offset); + w[59] = amd_bytealign (w[ 6], w[ 7], offset); + w[58] = amd_bytealign (w[ 5], w[ 6], offset); + w[57] = amd_bytealign (w[ 4], w[ 5], offset); + w[56] = amd_bytealign (w[ 3], w[ 4], offset); + w[55] = amd_bytealign (w[ 2], w[ 3], offset); + w[54] = amd_bytealign (w[ 1], w[ 2], offset); + w[53] = amd_bytealign (w[ 0], w[ 1], offset); + w[52] = amd_bytealign ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign (w[ 9], w[10], offset); + w[62] = amd_bytealign (w[ 8], w[ 9], offset); + w[61] = amd_bytealign (w[ 7], w[ 8], offset); + w[60] = amd_bytealign (w[ 6], w[ 7], offset); + w[59] = amd_bytealign (w[ 5], w[ 6], offset); + w[58] = amd_bytealign (w[ 4], w[ 5], offset); + w[57] = amd_bytealign (w[ 3], w[ 4], offset); + w[56] = amd_bytealign (w[ 2], w[ 3], offset); + w[55] = amd_bytealign (w[ 1], w[ 2], offset); + w[54] = amd_bytealign (w[ 0], w[ 1], offset); + w[53] = amd_bytealign ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign (w[ 8], w[ 9], offset); + w[62] = amd_bytealign (w[ 7], w[ 8], offset); + w[61] = amd_bytealign (w[ 6], w[ 7], offset); + w[60] = amd_bytealign (w[ 5], w[ 6], offset); + w[59] = amd_bytealign (w[ 4], w[ 5], offset); + w[58] = amd_bytealign (w[ 3], w[ 4], offset); + w[57] = amd_bytealign (w[ 2], w[ 3], offset); + w[56] = amd_bytealign (w[ 1], w[ 2], offset); + w[55] = amd_bytealign (w[ 0], w[ 1], offset); + w[54] = amd_bytealign ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign (w[ 7], w[ 8], offset); + w[62] = amd_bytealign (w[ 6], w[ 7], offset); + w[61] = amd_bytealign (w[ 5], w[ 6], offset); + w[60] = amd_bytealign (w[ 4], w[ 5], offset); + w[59] = amd_bytealign (w[ 3], w[ 4], offset); + w[58] = amd_bytealign (w[ 2], w[ 3], offset); + w[57] = amd_bytealign (w[ 1], w[ 2], offset); + w[56] = amd_bytealign (w[ 0], w[ 1], offset); + w[55] = amd_bytealign ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign (w[ 6], w[ 7], offset); + w[62] = amd_bytealign (w[ 5], w[ 6], offset); + w[61] = amd_bytealign (w[ 4], w[ 5], offset); + w[60] = amd_bytealign (w[ 3], w[ 4], offset); + w[59] = amd_bytealign (w[ 2], w[ 3], offset); + w[58] = amd_bytealign (w[ 1], w[ 2], offset); + w[57] = amd_bytealign (w[ 0], w[ 1], offset); + w[56] = amd_bytealign ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign (w[ 5], w[ 6], offset); + w[62] = amd_bytealign (w[ 4], w[ 5], offset); + w[61] = amd_bytealign (w[ 3], w[ 4], offset); + w[60] = amd_bytealign (w[ 2], w[ 3], offset); + w[59] = amd_bytealign (w[ 1], w[ 2], offset); + w[58] = amd_bytealign (w[ 0], w[ 1], offset); + w[57] = amd_bytealign ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign (w[ 4], w[ 5], offset); + w[62] = amd_bytealign (w[ 3], w[ 4], offset); + w[61] = amd_bytealign (w[ 2], w[ 3], offset); + w[60] = amd_bytealign (w[ 1], w[ 2], offset); + w[59] = amd_bytealign (w[ 0], w[ 1], offset); + w[58] = amd_bytealign ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = amd_bytealign (w[ 3], w[ 4], offset); + w[62] = amd_bytealign (w[ 2], w[ 3], offset); + w[61] = amd_bytealign (w[ 1], w[ 2], offset); + w[60] = amd_bytealign (w[ 0], w[ 1], offset); + w[59] = amd_bytealign ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = amd_bytealign (w[ 2], w[ 3], offset); + w[62] = amd_bytealign (w[ 1], w[ 2], offset); + w[61] = amd_bytealign (w[ 0], w[ 1], offset); + w[60] = amd_bytealign ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign (w[ 1], w[ 2], offset); + w[62] = amd_bytealign (w[ 0], w[ 1], offset); + w[61] = amd_bytealign ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign (w[ 0], w[ 1], offset); + w[62] = amd_bytealign ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm (w[63], w[62], selector); + w[62] = __byte_perm (w[62], w[61], selector); + w[61] = __byte_perm (w[61], w[60], selector); + w[60] = __byte_perm (w[60], w[59], selector); + w[59] = __byte_perm (w[59], w[58], selector); + w[58] = __byte_perm (w[58], w[57], selector); + w[57] = __byte_perm (w[57], w[56], selector); + w[56] = __byte_perm (w[56], w[55], selector); + w[55] = __byte_perm (w[55], w[54], selector); + w[54] = __byte_perm (w[54], w[53], selector); + w[53] = __byte_perm (w[53], w[52], selector); + w[52] = __byte_perm (w[52], w[51], selector); + w[51] = __byte_perm (w[51], w[50], selector); + w[50] = __byte_perm (w[50], w[49], selector); + w[49] = __byte_perm (w[49], w[48], selector); + w[48] = __byte_perm (w[48], w[47], selector); + w[47] = __byte_perm (w[47], w[46], selector); + w[46] = __byte_perm (w[46], w[45], selector); + w[45] = __byte_perm (w[45], w[44], selector); + w[44] = __byte_perm (w[44], w[43], selector); + w[43] = __byte_perm (w[43], w[42], selector); + w[42] = __byte_perm (w[42], w[41], selector); + w[41] = __byte_perm (w[41], w[40], selector); + w[40] = __byte_perm (w[40], w[39], selector); + w[39] = __byte_perm (w[39], w[38], selector); + w[38] = __byte_perm (w[38], w[37], selector); + w[37] = __byte_perm (w[37], w[36], selector); + w[36] = __byte_perm (w[36], w[35], selector); + w[35] = __byte_perm (w[35], w[34], selector); + w[34] = __byte_perm (w[34], w[33], selector); + w[33] = __byte_perm (w[33], w[32], selector); + w[32] = __byte_perm (w[32], w[31], selector); + w[31] = __byte_perm (w[31], w[30], selector); + w[30] = __byte_perm (w[30], w[29], selector); + w[29] = __byte_perm (w[29], w[28], selector); + w[28] = __byte_perm (w[28], w[27], selector); + w[27] = __byte_perm (w[27], w[26], selector); + w[26] = __byte_perm (w[26], w[25], selector); + w[25] = __byte_perm (w[25], w[24], selector); + w[24] = __byte_perm (w[24], w[23], selector); + w[23] = __byte_perm (w[23], w[22], selector); + w[22] = __byte_perm (w[22], w[21], selector); + w[21] = __byte_perm (w[21], w[20], selector); + w[20] = __byte_perm (w[20], w[19], selector); + w[19] = __byte_perm (w[19], w[18], selector); + w[18] = __byte_perm (w[18], w[17], selector); + w[17] = __byte_perm (w[17], w[16], selector); + w[16] = __byte_perm (w[16], w[15], selector); + w[15] = __byte_perm (w[15], w[14], selector); + w[14] = __byte_perm (w[14], w[13], selector); + w[13] = __byte_perm (w[13], w[12], selector); + w[12] = __byte_perm (w[12], w[11], selector); + w[11] = __byte_perm (w[11], w[10], selector); + w[10] = __byte_perm (w[10], w[ 9], selector); + w[ 9] = __byte_perm (w[ 9], w[ 8], selector); + w[ 8] = __byte_perm (w[ 8], w[ 7], selector); + w[ 7] = __byte_perm (w[ 7], w[ 6], selector); + w[ 6] = __byte_perm (w[ 6], w[ 5], selector); + w[ 5] = __byte_perm (w[ 5], w[ 4], selector); + w[ 4] = __byte_perm (w[ 4], w[ 3], selector); + w[ 3] = __byte_perm (w[ 3], w[ 2], selector); + w[ 2] = __byte_perm (w[ 2], w[ 1], selector); + w[ 1] = __byte_perm (w[ 1], w[ 0], selector); + w[ 0] = __byte_perm (w[ 0], 0, selector); + + break; + + case 1: + w[63] = __byte_perm (w[62], w[61], selector); + w[62] = __byte_perm (w[61], w[60], selector); + w[61] = __byte_perm (w[60], w[59], selector); + w[60] = __byte_perm (w[59], w[58], selector); + w[59] = __byte_perm (w[58], w[57], selector); + w[58] = __byte_perm (w[57], w[56], selector); + w[57] = __byte_perm (w[56], w[55], selector); + w[56] = __byte_perm (w[55], w[54], selector); + w[55] = __byte_perm (w[54], w[53], selector); + w[54] = __byte_perm (w[53], w[52], selector); + w[53] = __byte_perm (w[52], w[51], selector); + w[52] = __byte_perm (w[51], w[50], selector); + w[51] = __byte_perm (w[50], w[49], selector); + w[50] = __byte_perm (w[49], w[48], selector); + w[49] = __byte_perm (w[48], w[47], selector); + w[48] = __byte_perm (w[47], w[46], selector); + w[47] = __byte_perm (w[46], w[45], selector); + w[46] = __byte_perm (w[45], w[44], selector); + w[45] = __byte_perm (w[44], w[43], selector); + w[44] = __byte_perm (w[43], w[42], selector); + w[43] = __byte_perm (w[42], w[41], selector); + w[42] = __byte_perm (w[41], w[40], selector); + w[41] = __byte_perm (w[40], w[39], selector); + w[40] = __byte_perm (w[39], w[38], selector); + w[39] = __byte_perm (w[38], w[37], selector); + w[38] = __byte_perm (w[37], w[36], selector); + w[37] = __byte_perm (w[36], w[35], selector); + w[36] = __byte_perm (w[35], w[34], selector); + w[35] = __byte_perm (w[34], w[33], selector); + w[34] = __byte_perm (w[33], w[32], selector); + w[33] = __byte_perm (w[32], w[31], selector); + w[32] = __byte_perm (w[31], w[30], selector); + w[31] = __byte_perm (w[30], w[29], selector); + w[30] = __byte_perm (w[29], w[28], selector); + w[29] = __byte_perm (w[28], w[27], selector); + w[28] = __byte_perm (w[27], w[26], selector); + w[27] = __byte_perm (w[26], w[25], selector); + w[26] = __byte_perm (w[25], w[24], selector); + w[25] = __byte_perm (w[24], w[23], selector); + w[24] = __byte_perm (w[23], w[22], selector); + w[23] = __byte_perm (w[22], w[21], selector); + w[22] = __byte_perm (w[21], w[20], selector); + w[21] = __byte_perm (w[20], w[19], selector); + w[20] = __byte_perm (w[19], w[18], selector); + w[19] = __byte_perm (w[18], w[17], selector); + w[18] = __byte_perm (w[17], w[16], selector); + w[17] = __byte_perm (w[16], w[15], selector); + w[16] = __byte_perm (w[15], w[14], selector); + w[15] = __byte_perm (w[14], w[13], selector); + w[14] = __byte_perm (w[13], w[12], selector); + w[13] = __byte_perm (w[12], w[11], selector); + w[12] = __byte_perm (w[11], w[10], selector); + w[11] = __byte_perm (w[10], w[ 9], selector); + w[10] = __byte_perm (w[ 9], w[ 8], selector); + w[ 9] = __byte_perm (w[ 8], w[ 7], selector); + w[ 8] = __byte_perm (w[ 7], w[ 6], selector); + w[ 7] = __byte_perm (w[ 6], w[ 5], selector); + w[ 6] = __byte_perm (w[ 5], w[ 4], selector); + w[ 5] = __byte_perm (w[ 4], w[ 3], selector); + w[ 4] = __byte_perm (w[ 3], w[ 2], selector); + w[ 3] = __byte_perm (w[ 2], w[ 1], selector); + w[ 2] = __byte_perm (w[ 1], w[ 0], selector); + w[ 1] = __byte_perm (w[ 0], 0, selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm (w[61], w[60], selector); + w[62] = __byte_perm (w[60], w[59], selector); + w[61] = __byte_perm (w[59], w[58], selector); + w[60] = __byte_perm (w[58], w[57], selector); + w[59] = __byte_perm (w[57], w[56], selector); + w[58] = __byte_perm (w[56], w[55], selector); + w[57] = __byte_perm (w[55], w[54], selector); + w[56] = __byte_perm (w[54], w[53], selector); + w[55] = __byte_perm (w[53], w[52], selector); + w[54] = __byte_perm (w[52], w[51], selector); + w[53] = __byte_perm (w[51], w[50], selector); + w[52] = __byte_perm (w[50], w[49], selector); + w[51] = __byte_perm (w[49], w[48], selector); + w[50] = __byte_perm (w[48], w[47], selector); + w[49] = __byte_perm (w[47], w[46], selector); + w[48] = __byte_perm (w[46], w[45], selector); + w[47] = __byte_perm (w[45], w[44], selector); + w[46] = __byte_perm (w[44], w[43], selector); + w[45] = __byte_perm (w[43], w[42], selector); + w[44] = __byte_perm (w[42], w[41], selector); + w[43] = __byte_perm (w[41], w[40], selector); + w[42] = __byte_perm (w[40], w[39], selector); + w[41] = __byte_perm (w[39], w[38], selector); + w[40] = __byte_perm (w[38], w[37], selector); + w[39] = __byte_perm (w[37], w[36], selector); + w[38] = __byte_perm (w[36], w[35], selector); + w[37] = __byte_perm (w[35], w[34], selector); + w[36] = __byte_perm (w[34], w[33], selector); + w[35] = __byte_perm (w[33], w[32], selector); + w[34] = __byte_perm (w[32], w[31], selector); + w[33] = __byte_perm (w[31], w[30], selector); + w[32] = __byte_perm (w[30], w[29], selector); + w[31] = __byte_perm (w[29], w[28], selector); + w[30] = __byte_perm (w[28], w[27], selector); + w[29] = __byte_perm (w[27], w[26], selector); + w[28] = __byte_perm (w[26], w[25], selector); + w[27] = __byte_perm (w[25], w[24], selector); + w[26] = __byte_perm (w[24], w[23], selector); + w[25] = __byte_perm (w[23], w[22], selector); + w[24] = __byte_perm (w[22], w[21], selector); + w[23] = __byte_perm (w[21], w[20], selector); + w[22] = __byte_perm (w[20], w[19], selector); + w[21] = __byte_perm (w[19], w[18], selector); + w[20] = __byte_perm (w[18], w[17], selector); + w[19] = __byte_perm (w[17], w[16], selector); + w[18] = __byte_perm (w[16], w[15], selector); + w[17] = __byte_perm (w[15], w[14], selector); + w[16] = __byte_perm (w[14], w[13], selector); + w[15] = __byte_perm (w[13], w[12], selector); + w[14] = __byte_perm (w[12], w[11], selector); + w[13] = __byte_perm (w[11], w[10], selector); + w[12] = __byte_perm (w[10], w[ 9], selector); + w[11] = __byte_perm (w[ 9], w[ 8], selector); + w[10] = __byte_perm (w[ 8], w[ 7], selector); + w[ 9] = __byte_perm (w[ 7], w[ 6], selector); + w[ 8] = __byte_perm (w[ 6], w[ 5], selector); + w[ 7] = __byte_perm (w[ 5], w[ 4], selector); + w[ 6] = __byte_perm (w[ 4], w[ 3], selector); + w[ 5] = __byte_perm (w[ 3], w[ 2], selector); + w[ 4] = __byte_perm (w[ 2], w[ 1], selector); + w[ 3] = __byte_perm (w[ 1], w[ 0], selector); + w[ 2] = __byte_perm (w[ 0], 0, selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm (w[60], w[59], selector); + w[62] = __byte_perm (w[59], w[58], selector); + w[61] = __byte_perm (w[58], w[57], selector); + w[60] = __byte_perm (w[57], w[56], selector); + w[59] = __byte_perm (w[56], w[55], selector); + w[58] = __byte_perm (w[55], w[54], selector); + w[57] = __byte_perm (w[54], w[53], selector); + w[56] = __byte_perm (w[53], w[52], selector); + w[55] = __byte_perm (w[52], w[51], selector); + w[54] = __byte_perm (w[51], w[50], selector); + w[53] = __byte_perm (w[50], w[49], selector); + w[52] = __byte_perm (w[49], w[48], selector); + w[51] = __byte_perm (w[48], w[47], selector); + w[50] = __byte_perm (w[47], w[46], selector); + w[49] = __byte_perm (w[46], w[45], selector); + w[48] = __byte_perm (w[45], w[44], selector); + w[47] = __byte_perm (w[44], w[43], selector); + w[46] = __byte_perm (w[43], w[42], selector); + w[45] = __byte_perm (w[42], w[41], selector); + w[44] = __byte_perm (w[41], w[40], selector); + w[43] = __byte_perm (w[40], w[39], selector); + w[42] = __byte_perm (w[39], w[38], selector); + w[41] = __byte_perm (w[38], w[37], selector); + w[40] = __byte_perm (w[37], w[36], selector); + w[39] = __byte_perm (w[36], w[35], selector); + w[38] = __byte_perm (w[35], w[34], selector); + w[37] = __byte_perm (w[34], w[33], selector); + w[36] = __byte_perm (w[33], w[32], selector); + w[35] = __byte_perm (w[32], w[31], selector); + w[34] = __byte_perm (w[31], w[30], selector); + w[33] = __byte_perm (w[30], w[29], selector); + w[32] = __byte_perm (w[29], w[28], selector); + w[31] = __byte_perm (w[28], w[27], selector); + w[30] = __byte_perm (w[27], w[26], selector); + w[29] = __byte_perm (w[26], w[25], selector); + w[28] = __byte_perm (w[25], w[24], selector); + w[27] = __byte_perm (w[24], w[23], selector); + w[26] = __byte_perm (w[23], w[22], selector); + w[25] = __byte_perm (w[22], w[21], selector); + w[24] = __byte_perm (w[21], w[20], selector); + w[23] = __byte_perm (w[20], w[19], selector); + w[22] = __byte_perm (w[19], w[18], selector); + w[21] = __byte_perm (w[18], w[17], selector); + w[20] = __byte_perm (w[17], w[16], selector); + w[19] = __byte_perm (w[16], w[15], selector); + w[18] = __byte_perm (w[15], w[14], selector); + w[17] = __byte_perm (w[14], w[13], selector); + w[16] = __byte_perm (w[13], w[12], selector); + w[15] = __byte_perm (w[12], w[11], selector); + w[14] = __byte_perm (w[11], w[10], selector); + w[13] = __byte_perm (w[10], w[ 9], selector); + w[12] = __byte_perm (w[ 9], w[ 8], selector); + w[11] = __byte_perm (w[ 8], w[ 7], selector); + w[10] = __byte_perm (w[ 7], w[ 6], selector); + w[ 9] = __byte_perm (w[ 6], w[ 5], selector); + w[ 8] = __byte_perm (w[ 5], w[ 4], selector); + w[ 7] = __byte_perm (w[ 4], w[ 3], selector); + w[ 6] = __byte_perm (w[ 3], w[ 2], selector); + w[ 5] = __byte_perm (w[ 2], w[ 1], selector); + w[ 4] = __byte_perm (w[ 1], w[ 0], selector); + w[ 3] = __byte_perm (w[ 0], 0, selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm (w[59], w[58], selector); + w[62] = __byte_perm (w[58], w[57], selector); + w[61] = __byte_perm (w[57], w[56], selector); + w[60] = __byte_perm (w[56], w[55], selector); + w[59] = __byte_perm (w[55], w[54], selector); + w[58] = __byte_perm (w[54], w[53], selector); + w[57] = __byte_perm (w[53], w[52], selector); + w[56] = __byte_perm (w[52], w[51], selector); + w[55] = __byte_perm (w[51], w[50], selector); + w[54] = __byte_perm (w[50], w[49], selector); + w[53] = __byte_perm (w[49], w[48], selector); + w[52] = __byte_perm (w[48], w[47], selector); + w[51] = __byte_perm (w[47], w[46], selector); + w[50] = __byte_perm (w[46], w[45], selector); + w[49] = __byte_perm (w[45], w[44], selector); + w[48] = __byte_perm (w[44], w[43], selector); + w[47] = __byte_perm (w[43], w[42], selector); + w[46] = __byte_perm (w[42], w[41], selector); + w[45] = __byte_perm (w[41], w[40], selector); + w[44] = __byte_perm (w[40], w[39], selector); + w[43] = __byte_perm (w[39], w[38], selector); + w[42] = __byte_perm (w[38], w[37], selector); + w[41] = __byte_perm (w[37], w[36], selector); + w[40] = __byte_perm (w[36], w[35], selector); + w[39] = __byte_perm (w[35], w[34], selector); + w[38] = __byte_perm (w[34], w[33], selector); + w[37] = __byte_perm (w[33], w[32], selector); + w[36] = __byte_perm (w[32], w[31], selector); + w[35] = __byte_perm (w[31], w[30], selector); + w[34] = __byte_perm (w[30], w[29], selector); + w[33] = __byte_perm (w[29], w[28], selector); + w[32] = __byte_perm (w[28], w[27], selector); + w[31] = __byte_perm (w[27], w[26], selector); + w[30] = __byte_perm (w[26], w[25], selector); + w[29] = __byte_perm (w[25], w[24], selector); + w[28] = __byte_perm (w[24], w[23], selector); + w[27] = __byte_perm (w[23], w[22], selector); + w[26] = __byte_perm (w[22], w[21], selector); + w[25] = __byte_perm (w[21], w[20], selector); + w[24] = __byte_perm (w[20], w[19], selector); + w[23] = __byte_perm (w[19], w[18], selector); + w[22] = __byte_perm (w[18], w[17], selector); + w[21] = __byte_perm (w[17], w[16], selector); + w[20] = __byte_perm (w[16], w[15], selector); + w[19] = __byte_perm (w[15], w[14], selector); + w[18] = __byte_perm (w[14], w[13], selector); + w[17] = __byte_perm (w[13], w[12], selector); + w[16] = __byte_perm (w[12], w[11], selector); + w[15] = __byte_perm (w[11], w[10], selector); + w[14] = __byte_perm (w[10], w[ 9], selector); + w[13] = __byte_perm (w[ 9], w[ 8], selector); + w[12] = __byte_perm (w[ 8], w[ 7], selector); + w[11] = __byte_perm (w[ 7], w[ 6], selector); + w[10] = __byte_perm (w[ 6], w[ 5], selector); + w[ 9] = __byte_perm (w[ 5], w[ 4], selector); + w[ 8] = __byte_perm (w[ 4], w[ 3], selector); + w[ 7] = __byte_perm (w[ 3], w[ 2], selector); + w[ 6] = __byte_perm (w[ 2], w[ 1], selector); + w[ 5] = __byte_perm (w[ 1], w[ 0], selector); + w[ 4] = __byte_perm (w[ 0], 0, selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm (w[58], w[57], selector); + w[62] = __byte_perm (w[57], w[56], selector); + w[61] = __byte_perm (w[56], w[55], selector); + w[60] = __byte_perm (w[55], w[54], selector); + w[59] = __byte_perm (w[54], w[53], selector); + w[58] = __byte_perm (w[53], w[52], selector); + w[57] = __byte_perm (w[52], w[51], selector); + w[56] = __byte_perm (w[51], w[50], selector); + w[55] = __byte_perm (w[50], w[49], selector); + w[54] = __byte_perm (w[49], w[48], selector); + w[53] = __byte_perm (w[48], w[47], selector); + w[52] = __byte_perm (w[47], w[46], selector); + w[51] = __byte_perm (w[46], w[45], selector); + w[50] = __byte_perm (w[45], w[44], selector); + w[49] = __byte_perm (w[44], w[43], selector); + w[48] = __byte_perm (w[43], w[42], selector); + w[47] = __byte_perm (w[42], w[41], selector); + w[46] = __byte_perm (w[41], w[40], selector); + w[45] = __byte_perm (w[40], w[39], selector); + w[44] = __byte_perm (w[39], w[38], selector); + w[43] = __byte_perm (w[38], w[37], selector); + w[42] = __byte_perm (w[37], w[36], selector); + w[41] = __byte_perm (w[36], w[35], selector); + w[40] = __byte_perm (w[35], w[34], selector); + w[39] = __byte_perm (w[34], w[33], selector); + w[38] = __byte_perm (w[33], w[32], selector); + w[37] = __byte_perm (w[32], w[31], selector); + w[36] = __byte_perm (w[31], w[30], selector); + w[35] = __byte_perm (w[30], w[29], selector); + w[34] = __byte_perm (w[29], w[28], selector); + w[33] = __byte_perm (w[28], w[27], selector); + w[32] = __byte_perm (w[27], w[26], selector); + w[31] = __byte_perm (w[26], w[25], selector); + w[30] = __byte_perm (w[25], w[24], selector); + w[29] = __byte_perm (w[24], w[23], selector); + w[28] = __byte_perm (w[23], w[22], selector); + w[27] = __byte_perm (w[22], w[21], selector); + w[26] = __byte_perm (w[21], w[20], selector); + w[25] = __byte_perm (w[20], w[19], selector); + w[24] = __byte_perm (w[19], w[18], selector); + w[23] = __byte_perm (w[18], w[17], selector); + w[22] = __byte_perm (w[17], w[16], selector); + w[21] = __byte_perm (w[16], w[15], selector); + w[20] = __byte_perm (w[15], w[14], selector); + w[19] = __byte_perm (w[14], w[13], selector); + w[18] = __byte_perm (w[13], w[12], selector); + w[17] = __byte_perm (w[12], w[11], selector); + w[16] = __byte_perm (w[11], w[10], selector); + w[15] = __byte_perm (w[10], w[ 9], selector); + w[14] = __byte_perm (w[ 9], w[ 8], selector); + w[13] = __byte_perm (w[ 8], w[ 7], selector); + w[12] = __byte_perm (w[ 7], w[ 6], selector); + w[11] = __byte_perm (w[ 6], w[ 5], selector); + w[10] = __byte_perm (w[ 5], w[ 4], selector); + w[ 9] = __byte_perm (w[ 4], w[ 3], selector); + w[ 8] = __byte_perm (w[ 3], w[ 2], selector); + w[ 7] = __byte_perm (w[ 2], w[ 1], selector); + w[ 6] = __byte_perm (w[ 1], w[ 0], selector); + w[ 5] = __byte_perm (w[ 0], 0, selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm (w[57], w[56], selector); + w[62] = __byte_perm (w[56], w[55], selector); + w[61] = __byte_perm (w[55], w[54], selector); + w[60] = __byte_perm (w[54], w[53], selector); + w[59] = __byte_perm (w[53], w[52], selector); + w[58] = __byte_perm (w[52], w[51], selector); + w[57] = __byte_perm (w[51], w[50], selector); + w[56] = __byte_perm (w[50], w[49], selector); + w[55] = __byte_perm (w[49], w[48], selector); + w[54] = __byte_perm (w[48], w[47], selector); + w[53] = __byte_perm (w[47], w[46], selector); + w[52] = __byte_perm (w[46], w[45], selector); + w[51] = __byte_perm (w[45], w[44], selector); + w[50] = __byte_perm (w[44], w[43], selector); + w[49] = __byte_perm (w[43], w[42], selector); + w[48] = __byte_perm (w[42], w[41], selector); + w[47] = __byte_perm (w[41], w[40], selector); + w[46] = __byte_perm (w[40], w[39], selector); + w[45] = __byte_perm (w[39], w[38], selector); + w[44] = __byte_perm (w[38], w[37], selector); + w[43] = __byte_perm (w[37], w[36], selector); + w[42] = __byte_perm (w[36], w[35], selector); + w[41] = __byte_perm (w[35], w[34], selector); + w[40] = __byte_perm (w[34], w[33], selector); + w[39] = __byte_perm (w[33], w[32], selector); + w[38] = __byte_perm (w[32], w[31], selector); + w[37] = __byte_perm (w[31], w[30], selector); + w[36] = __byte_perm (w[30], w[29], selector); + w[35] = __byte_perm (w[29], w[28], selector); + w[34] = __byte_perm (w[28], w[27], selector); + w[33] = __byte_perm (w[27], w[26], selector); + w[32] = __byte_perm (w[26], w[25], selector); + w[31] = __byte_perm (w[25], w[24], selector); + w[30] = __byte_perm (w[24], w[23], selector); + w[29] = __byte_perm (w[23], w[22], selector); + w[28] = __byte_perm (w[22], w[21], selector); + w[27] = __byte_perm (w[21], w[20], selector); + w[26] = __byte_perm (w[20], w[19], selector); + w[25] = __byte_perm (w[19], w[18], selector); + w[24] = __byte_perm (w[18], w[17], selector); + w[23] = __byte_perm (w[17], w[16], selector); + w[22] = __byte_perm (w[16], w[15], selector); + w[21] = __byte_perm (w[15], w[14], selector); + w[20] = __byte_perm (w[14], w[13], selector); + w[19] = __byte_perm (w[13], w[12], selector); + w[18] = __byte_perm (w[12], w[11], selector); + w[17] = __byte_perm (w[11], w[10], selector); + w[16] = __byte_perm (w[10], w[ 9], selector); + w[15] = __byte_perm (w[ 9], w[ 8], selector); + w[14] = __byte_perm (w[ 8], w[ 7], selector); + w[13] = __byte_perm (w[ 7], w[ 6], selector); + w[12] = __byte_perm (w[ 6], w[ 5], selector); + w[11] = __byte_perm (w[ 5], w[ 4], selector); + w[10] = __byte_perm (w[ 4], w[ 3], selector); + w[ 9] = __byte_perm (w[ 3], w[ 2], selector); + w[ 8] = __byte_perm (w[ 2], w[ 1], selector); + w[ 7] = __byte_perm (w[ 1], w[ 0], selector); + w[ 6] = __byte_perm (w[ 0], 0, selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm (w[56], w[55], selector); + w[62] = __byte_perm (w[55], w[54], selector); + w[61] = __byte_perm (w[54], w[53], selector); + w[60] = __byte_perm (w[53], w[52], selector); + w[59] = __byte_perm (w[52], w[51], selector); + w[58] = __byte_perm (w[51], w[50], selector); + w[57] = __byte_perm (w[50], w[49], selector); + w[56] = __byte_perm (w[49], w[48], selector); + w[55] = __byte_perm (w[48], w[47], selector); + w[54] = __byte_perm (w[47], w[46], selector); + w[53] = __byte_perm (w[46], w[45], selector); + w[52] = __byte_perm (w[45], w[44], selector); + w[51] = __byte_perm (w[44], w[43], selector); + w[50] = __byte_perm (w[43], w[42], selector); + w[49] = __byte_perm (w[42], w[41], selector); + w[48] = __byte_perm (w[41], w[40], selector); + w[47] = __byte_perm (w[40], w[39], selector); + w[46] = __byte_perm (w[39], w[38], selector); + w[45] = __byte_perm (w[38], w[37], selector); + w[44] = __byte_perm (w[37], w[36], selector); + w[43] = __byte_perm (w[36], w[35], selector); + w[42] = __byte_perm (w[35], w[34], selector); + w[41] = __byte_perm (w[34], w[33], selector); + w[40] = __byte_perm (w[33], w[32], selector); + w[39] = __byte_perm (w[32], w[31], selector); + w[38] = __byte_perm (w[31], w[30], selector); + w[37] = __byte_perm (w[30], w[29], selector); + w[36] = __byte_perm (w[29], w[28], selector); + w[35] = __byte_perm (w[28], w[27], selector); + w[34] = __byte_perm (w[27], w[26], selector); + w[33] = __byte_perm (w[26], w[25], selector); + w[32] = __byte_perm (w[25], w[24], selector); + w[31] = __byte_perm (w[24], w[23], selector); + w[30] = __byte_perm (w[23], w[22], selector); + w[29] = __byte_perm (w[22], w[21], selector); + w[28] = __byte_perm (w[21], w[20], selector); + w[27] = __byte_perm (w[20], w[19], selector); + w[26] = __byte_perm (w[19], w[18], selector); + w[25] = __byte_perm (w[18], w[17], selector); + w[24] = __byte_perm (w[17], w[16], selector); + w[23] = __byte_perm (w[16], w[15], selector); + w[22] = __byte_perm (w[15], w[14], selector); + w[21] = __byte_perm (w[14], w[13], selector); + w[20] = __byte_perm (w[13], w[12], selector); + w[19] = __byte_perm (w[12], w[11], selector); + w[18] = __byte_perm (w[11], w[10], selector); + w[17] = __byte_perm (w[10], w[ 9], selector); + w[16] = __byte_perm (w[ 9], w[ 8], selector); + w[15] = __byte_perm (w[ 8], w[ 7], selector); + w[14] = __byte_perm (w[ 7], w[ 6], selector); + w[13] = __byte_perm (w[ 6], w[ 5], selector); + w[12] = __byte_perm (w[ 5], w[ 4], selector); + w[11] = __byte_perm (w[ 4], w[ 3], selector); + w[10] = __byte_perm (w[ 3], w[ 2], selector); + w[ 9] = __byte_perm (w[ 2], w[ 1], selector); + w[ 8] = __byte_perm (w[ 1], w[ 0], selector); + w[ 7] = __byte_perm (w[ 0], 0, selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = __byte_perm (w[55], w[54], selector); + w[62] = __byte_perm (w[54], w[53], selector); + w[61] = __byte_perm (w[53], w[52], selector); + w[60] = __byte_perm (w[52], w[51], selector); + w[59] = __byte_perm (w[51], w[50], selector); + w[58] = __byte_perm (w[50], w[49], selector); + w[57] = __byte_perm (w[49], w[48], selector); + w[56] = __byte_perm (w[48], w[47], selector); + w[55] = __byte_perm (w[47], w[46], selector); + w[54] = __byte_perm (w[46], w[45], selector); + w[53] = __byte_perm (w[45], w[44], selector); + w[52] = __byte_perm (w[44], w[43], selector); + w[51] = __byte_perm (w[43], w[42], selector); + w[50] = __byte_perm (w[42], w[41], selector); + w[49] = __byte_perm (w[41], w[40], selector); + w[48] = __byte_perm (w[40], w[39], selector); + w[47] = __byte_perm (w[39], w[38], selector); + w[46] = __byte_perm (w[38], w[37], selector); + w[45] = __byte_perm (w[37], w[36], selector); + w[44] = __byte_perm (w[36], w[35], selector); + w[43] = __byte_perm (w[35], w[34], selector); + w[42] = __byte_perm (w[34], w[33], selector); + w[41] = __byte_perm (w[33], w[32], selector); + w[40] = __byte_perm (w[32], w[31], selector); + w[39] = __byte_perm (w[31], w[30], selector); + w[38] = __byte_perm (w[30], w[29], selector); + w[37] = __byte_perm (w[29], w[28], selector); + w[36] = __byte_perm (w[28], w[27], selector); + w[35] = __byte_perm (w[27], w[26], selector); + w[34] = __byte_perm (w[26], w[25], selector); + w[33] = __byte_perm (w[25], w[24], selector); + w[32] = __byte_perm (w[24], w[23], selector); + w[31] = __byte_perm (w[23], w[22], selector); + w[30] = __byte_perm (w[22], w[21], selector); + w[29] = __byte_perm (w[21], w[20], selector); + w[28] = __byte_perm (w[20], w[19], selector); + w[27] = __byte_perm (w[19], w[18], selector); + w[26] = __byte_perm (w[18], w[17], selector); + w[25] = __byte_perm (w[17], w[16], selector); + w[24] = __byte_perm (w[16], w[15], selector); + w[23] = __byte_perm (w[15], w[14], selector); + w[22] = __byte_perm (w[14], w[13], selector); + w[21] = __byte_perm (w[13], w[12], selector); + w[20] = __byte_perm (w[12], w[11], selector); + w[19] = __byte_perm (w[11], w[10], selector); + w[18] = __byte_perm (w[10], w[ 9], selector); + w[17] = __byte_perm (w[ 9], w[ 8], selector); + w[16] = __byte_perm (w[ 8], w[ 7], selector); + w[15] = __byte_perm (w[ 7], w[ 6], selector); + w[14] = __byte_perm (w[ 6], w[ 5], selector); + w[13] = __byte_perm (w[ 5], w[ 4], selector); + w[12] = __byte_perm (w[ 4], w[ 3], selector); + w[11] = __byte_perm (w[ 3], w[ 2], selector); + w[10] = __byte_perm (w[ 2], w[ 1], selector); + w[ 9] = __byte_perm (w[ 1], w[ 0], selector); + w[ 8] = __byte_perm (w[ 0], 0, selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm (w[54], w[53], selector); + w[62] = __byte_perm (w[53], w[52], selector); + w[61] = __byte_perm (w[52], w[51], selector); + w[60] = __byte_perm (w[51], w[50], selector); + w[59] = __byte_perm (w[50], w[49], selector); + w[58] = __byte_perm (w[49], w[48], selector); + w[57] = __byte_perm (w[48], w[47], selector); + w[56] = __byte_perm (w[47], w[46], selector); + w[55] = __byte_perm (w[46], w[45], selector); + w[54] = __byte_perm (w[45], w[44], selector); + w[53] = __byte_perm (w[44], w[43], selector); + w[52] = __byte_perm (w[43], w[42], selector); + w[51] = __byte_perm (w[42], w[41], selector); + w[50] = __byte_perm (w[41], w[40], selector); + w[49] = __byte_perm (w[40], w[39], selector); + w[48] = __byte_perm (w[39], w[38], selector); + w[47] = __byte_perm (w[38], w[37], selector); + w[46] = __byte_perm (w[37], w[36], selector); + w[45] = __byte_perm (w[36], w[35], selector); + w[44] = __byte_perm (w[35], w[34], selector); + w[43] = __byte_perm (w[34], w[33], selector); + w[42] = __byte_perm (w[33], w[32], selector); + w[41] = __byte_perm (w[32], w[31], selector); + w[40] = __byte_perm (w[31], w[30], selector); + w[39] = __byte_perm (w[30], w[29], selector); + w[38] = __byte_perm (w[29], w[28], selector); + w[37] = __byte_perm (w[28], w[27], selector); + w[36] = __byte_perm (w[27], w[26], selector); + w[35] = __byte_perm (w[26], w[25], selector); + w[34] = __byte_perm (w[25], w[24], selector); + w[33] = __byte_perm (w[24], w[23], selector); + w[32] = __byte_perm (w[23], w[22], selector); + w[31] = __byte_perm (w[22], w[21], selector); + w[30] = __byte_perm (w[21], w[20], selector); + w[29] = __byte_perm (w[20], w[19], selector); + w[28] = __byte_perm (w[19], w[18], selector); + w[27] = __byte_perm (w[18], w[17], selector); + w[26] = __byte_perm (w[17], w[16], selector); + w[25] = __byte_perm (w[16], w[15], selector); + w[24] = __byte_perm (w[15], w[14], selector); + w[23] = __byte_perm (w[14], w[13], selector); + w[22] = __byte_perm (w[13], w[12], selector); + w[21] = __byte_perm (w[12], w[11], selector); + w[20] = __byte_perm (w[11], w[10], selector); + w[19] = __byte_perm (w[10], w[ 9], selector); + w[18] = __byte_perm (w[ 9], w[ 8], selector); + w[17] = __byte_perm (w[ 8], w[ 7], selector); + w[16] = __byte_perm (w[ 7], w[ 6], selector); + w[15] = __byte_perm (w[ 6], w[ 5], selector); + w[14] = __byte_perm (w[ 5], w[ 4], selector); + w[13] = __byte_perm (w[ 4], w[ 3], selector); + w[12] = __byte_perm (w[ 3], w[ 2], selector); + w[11] = __byte_perm (w[ 2], w[ 1], selector); + w[10] = __byte_perm (w[ 1], w[ 0], selector); + w[ 9] = __byte_perm (w[ 0], 0, selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm (w[53], w[52], selector); + w[62] = __byte_perm (w[52], w[51], selector); + w[61] = __byte_perm (w[51], w[50], selector); + w[60] = __byte_perm (w[50], w[49], selector); + w[59] = __byte_perm (w[49], w[48], selector); + w[58] = __byte_perm (w[48], w[47], selector); + w[57] = __byte_perm (w[47], w[46], selector); + w[56] = __byte_perm (w[46], w[45], selector); + w[55] = __byte_perm (w[45], w[44], selector); + w[54] = __byte_perm (w[44], w[43], selector); + w[53] = __byte_perm (w[43], w[42], selector); + w[52] = __byte_perm (w[42], w[41], selector); + w[51] = __byte_perm (w[41], w[40], selector); + w[50] = __byte_perm (w[40], w[39], selector); + w[49] = __byte_perm (w[39], w[38], selector); + w[48] = __byte_perm (w[38], w[37], selector); + w[47] = __byte_perm (w[37], w[36], selector); + w[46] = __byte_perm (w[36], w[35], selector); + w[45] = __byte_perm (w[35], w[34], selector); + w[44] = __byte_perm (w[34], w[33], selector); + w[43] = __byte_perm (w[33], w[32], selector); + w[42] = __byte_perm (w[32], w[31], selector); + w[41] = __byte_perm (w[31], w[30], selector); + w[40] = __byte_perm (w[30], w[29], selector); + w[39] = __byte_perm (w[29], w[28], selector); + w[38] = __byte_perm (w[28], w[27], selector); + w[37] = __byte_perm (w[27], w[26], selector); + w[36] = __byte_perm (w[26], w[25], selector); + w[35] = __byte_perm (w[25], w[24], selector); + w[34] = __byte_perm (w[24], w[23], selector); + w[33] = __byte_perm (w[23], w[22], selector); + w[32] = __byte_perm (w[22], w[21], selector); + w[31] = __byte_perm (w[21], w[20], selector); + w[30] = __byte_perm (w[20], w[19], selector); + w[29] = __byte_perm (w[19], w[18], selector); + w[28] = __byte_perm (w[18], w[17], selector); + w[27] = __byte_perm (w[17], w[16], selector); + w[26] = __byte_perm (w[16], w[15], selector); + w[25] = __byte_perm (w[15], w[14], selector); + w[24] = __byte_perm (w[14], w[13], selector); + w[23] = __byte_perm (w[13], w[12], selector); + w[22] = __byte_perm (w[12], w[11], selector); + w[21] = __byte_perm (w[11], w[10], selector); + w[20] = __byte_perm (w[10], w[ 9], selector); + w[19] = __byte_perm (w[ 9], w[ 8], selector); + w[18] = __byte_perm (w[ 8], w[ 7], selector); + w[17] = __byte_perm (w[ 7], w[ 6], selector); + w[16] = __byte_perm (w[ 6], w[ 5], selector); + w[15] = __byte_perm (w[ 5], w[ 4], selector); + w[14] = __byte_perm (w[ 4], w[ 3], selector); + w[13] = __byte_perm (w[ 3], w[ 2], selector); + w[12] = __byte_perm (w[ 2], w[ 1], selector); + w[11] = __byte_perm (w[ 1], w[ 0], selector); + w[10] = __byte_perm (w[ 0], 0, selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm (w[52], w[51], selector); + w[62] = __byte_perm (w[51], w[50], selector); + w[61] = __byte_perm (w[50], w[49], selector); + w[60] = __byte_perm (w[49], w[48], selector); + w[59] = __byte_perm (w[48], w[47], selector); + w[58] = __byte_perm (w[47], w[46], selector); + w[57] = __byte_perm (w[46], w[45], selector); + w[56] = __byte_perm (w[45], w[44], selector); + w[55] = __byte_perm (w[44], w[43], selector); + w[54] = __byte_perm (w[43], w[42], selector); + w[53] = __byte_perm (w[42], w[41], selector); + w[52] = __byte_perm (w[41], w[40], selector); + w[51] = __byte_perm (w[40], w[39], selector); + w[50] = __byte_perm (w[39], w[38], selector); + w[49] = __byte_perm (w[38], w[37], selector); + w[48] = __byte_perm (w[37], w[36], selector); + w[47] = __byte_perm (w[36], w[35], selector); + w[46] = __byte_perm (w[35], w[34], selector); + w[45] = __byte_perm (w[34], w[33], selector); + w[44] = __byte_perm (w[33], w[32], selector); + w[43] = __byte_perm (w[32], w[31], selector); + w[42] = __byte_perm (w[31], w[30], selector); + w[41] = __byte_perm (w[30], w[29], selector); + w[40] = __byte_perm (w[29], w[28], selector); + w[39] = __byte_perm (w[28], w[27], selector); + w[38] = __byte_perm (w[27], w[26], selector); + w[37] = __byte_perm (w[26], w[25], selector); + w[36] = __byte_perm (w[25], w[24], selector); + w[35] = __byte_perm (w[24], w[23], selector); + w[34] = __byte_perm (w[23], w[22], selector); + w[33] = __byte_perm (w[22], w[21], selector); + w[32] = __byte_perm (w[21], w[20], selector); + w[31] = __byte_perm (w[20], w[19], selector); + w[30] = __byte_perm (w[19], w[18], selector); + w[29] = __byte_perm (w[18], w[17], selector); + w[28] = __byte_perm (w[17], w[16], selector); + w[27] = __byte_perm (w[16], w[15], selector); + w[26] = __byte_perm (w[15], w[14], selector); + w[25] = __byte_perm (w[14], w[13], selector); + w[24] = __byte_perm (w[13], w[12], selector); + w[23] = __byte_perm (w[12], w[11], selector); + w[22] = __byte_perm (w[11], w[10], selector); + w[21] = __byte_perm (w[10], w[ 9], selector); + w[20] = __byte_perm (w[ 9], w[ 8], selector); + w[19] = __byte_perm (w[ 8], w[ 7], selector); + w[18] = __byte_perm (w[ 7], w[ 6], selector); + w[17] = __byte_perm (w[ 6], w[ 5], selector); + w[16] = __byte_perm (w[ 5], w[ 4], selector); + w[15] = __byte_perm (w[ 4], w[ 3], selector); + w[14] = __byte_perm (w[ 3], w[ 2], selector); + w[13] = __byte_perm (w[ 2], w[ 1], selector); + w[12] = __byte_perm (w[ 1], w[ 0], selector); + w[11] = __byte_perm (w[ 0], 0, selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm (w[51], w[50], selector); + w[62] = __byte_perm (w[50], w[49], selector); + w[61] = __byte_perm (w[49], w[48], selector); + w[60] = __byte_perm (w[48], w[47], selector); + w[59] = __byte_perm (w[47], w[46], selector); + w[58] = __byte_perm (w[46], w[45], selector); + w[57] = __byte_perm (w[45], w[44], selector); + w[56] = __byte_perm (w[44], w[43], selector); + w[55] = __byte_perm (w[43], w[42], selector); + w[54] = __byte_perm (w[42], w[41], selector); + w[53] = __byte_perm (w[41], w[40], selector); + w[52] = __byte_perm (w[40], w[39], selector); + w[51] = __byte_perm (w[39], w[38], selector); + w[50] = __byte_perm (w[38], w[37], selector); + w[49] = __byte_perm (w[37], w[36], selector); + w[48] = __byte_perm (w[36], w[35], selector); + w[47] = __byte_perm (w[35], w[34], selector); + w[46] = __byte_perm (w[34], w[33], selector); + w[45] = __byte_perm (w[33], w[32], selector); + w[44] = __byte_perm (w[32], w[31], selector); + w[43] = __byte_perm (w[31], w[30], selector); + w[42] = __byte_perm (w[30], w[29], selector); + w[41] = __byte_perm (w[29], w[28], selector); + w[40] = __byte_perm (w[28], w[27], selector); + w[39] = __byte_perm (w[27], w[26], selector); + w[38] = __byte_perm (w[26], w[25], selector); + w[37] = __byte_perm (w[25], w[24], selector); + w[36] = __byte_perm (w[24], w[23], selector); + w[35] = __byte_perm (w[23], w[22], selector); + w[34] = __byte_perm (w[22], w[21], selector); + w[33] = __byte_perm (w[21], w[20], selector); + w[32] = __byte_perm (w[20], w[19], selector); + w[31] = __byte_perm (w[19], w[18], selector); + w[30] = __byte_perm (w[18], w[17], selector); + w[29] = __byte_perm (w[17], w[16], selector); + w[28] = __byte_perm (w[16], w[15], selector); + w[27] = __byte_perm (w[15], w[14], selector); + w[26] = __byte_perm (w[14], w[13], selector); + w[25] = __byte_perm (w[13], w[12], selector); + w[24] = __byte_perm (w[12], w[11], selector); + w[23] = __byte_perm (w[11], w[10], selector); + w[22] = __byte_perm (w[10], w[ 9], selector); + w[21] = __byte_perm (w[ 9], w[ 8], selector); + w[20] = __byte_perm (w[ 8], w[ 7], selector); + w[19] = __byte_perm (w[ 7], w[ 6], selector); + w[18] = __byte_perm (w[ 6], w[ 5], selector); + w[17] = __byte_perm (w[ 5], w[ 4], selector); + w[16] = __byte_perm (w[ 4], w[ 3], selector); + w[15] = __byte_perm (w[ 3], w[ 2], selector); + w[14] = __byte_perm (w[ 2], w[ 1], selector); + w[13] = __byte_perm (w[ 1], w[ 0], selector); + w[12] = __byte_perm (w[ 0], 0, selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = __byte_perm (w[50], w[49], selector); + w[62] = __byte_perm (w[49], w[48], selector); + w[61] = __byte_perm (w[48], w[47], selector); + w[60] = __byte_perm (w[47], w[46], selector); + w[59] = __byte_perm (w[46], w[45], selector); + w[58] = __byte_perm (w[45], w[44], selector); + w[57] = __byte_perm (w[44], w[43], selector); + w[56] = __byte_perm (w[43], w[42], selector); + w[55] = __byte_perm (w[42], w[41], selector); + w[54] = __byte_perm (w[41], w[40], selector); + w[53] = __byte_perm (w[40], w[39], selector); + w[52] = __byte_perm (w[39], w[38], selector); + w[51] = __byte_perm (w[38], w[37], selector); + w[50] = __byte_perm (w[37], w[36], selector); + w[49] = __byte_perm (w[36], w[35], selector); + w[48] = __byte_perm (w[35], w[34], selector); + w[47] = __byte_perm (w[34], w[33], selector); + w[46] = __byte_perm (w[33], w[32], selector); + w[45] = __byte_perm (w[32], w[31], selector); + w[44] = __byte_perm (w[31], w[30], selector); + w[43] = __byte_perm (w[30], w[29], selector); + w[42] = __byte_perm (w[29], w[28], selector); + w[41] = __byte_perm (w[28], w[27], selector); + w[40] = __byte_perm (w[27], w[26], selector); + w[39] = __byte_perm (w[26], w[25], selector); + w[38] = __byte_perm (w[25], w[24], selector); + w[37] = __byte_perm (w[24], w[23], selector); + w[36] = __byte_perm (w[23], w[22], selector); + w[35] = __byte_perm (w[22], w[21], selector); + w[34] = __byte_perm (w[21], w[20], selector); + w[33] = __byte_perm (w[20], w[19], selector); + w[32] = __byte_perm (w[19], w[18], selector); + w[31] = __byte_perm (w[18], w[17], selector); + w[30] = __byte_perm (w[17], w[16], selector); + w[29] = __byte_perm (w[16], w[15], selector); + w[28] = __byte_perm (w[15], w[14], selector); + w[27] = __byte_perm (w[14], w[13], selector); + w[26] = __byte_perm (w[13], w[12], selector); + w[25] = __byte_perm (w[12], w[11], selector); + w[24] = __byte_perm (w[11], w[10], selector); + w[23] = __byte_perm (w[10], w[ 9], selector); + w[22] = __byte_perm (w[ 9], w[ 8], selector); + w[21] = __byte_perm (w[ 8], w[ 7], selector); + w[20] = __byte_perm (w[ 7], w[ 6], selector); + w[19] = __byte_perm (w[ 6], w[ 5], selector); + w[18] = __byte_perm (w[ 5], w[ 4], selector); + w[17] = __byte_perm (w[ 4], w[ 3], selector); + w[16] = __byte_perm (w[ 3], w[ 2], selector); + w[15] = __byte_perm (w[ 2], w[ 1], selector); + w[14] = __byte_perm (w[ 1], w[ 0], selector); + w[13] = __byte_perm (w[ 0], 0, selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm (w[49], w[48], selector); + w[62] = __byte_perm (w[48], w[47], selector); + w[61] = __byte_perm (w[47], w[46], selector); + w[60] = __byte_perm (w[46], w[45], selector); + w[59] = __byte_perm (w[45], w[44], selector); + w[58] = __byte_perm (w[44], w[43], selector); + w[57] = __byte_perm (w[43], w[42], selector); + w[56] = __byte_perm (w[42], w[41], selector); + w[55] = __byte_perm (w[41], w[40], selector); + w[54] = __byte_perm (w[40], w[39], selector); + w[53] = __byte_perm (w[39], w[38], selector); + w[52] = __byte_perm (w[38], w[37], selector); + w[51] = __byte_perm (w[37], w[36], selector); + w[50] = __byte_perm (w[36], w[35], selector); + w[49] = __byte_perm (w[35], w[34], selector); + w[48] = __byte_perm (w[34], w[33], selector); + w[47] = __byte_perm (w[33], w[32], selector); + w[46] = __byte_perm (w[32], w[31], selector); + w[45] = __byte_perm (w[31], w[30], selector); + w[44] = __byte_perm (w[30], w[29], selector); + w[43] = __byte_perm (w[29], w[28], selector); + w[42] = __byte_perm (w[28], w[27], selector); + w[41] = __byte_perm (w[27], w[26], selector); + w[40] = __byte_perm (w[26], w[25], selector); + w[39] = __byte_perm (w[25], w[24], selector); + w[38] = __byte_perm (w[24], w[23], selector); + w[37] = __byte_perm (w[23], w[22], selector); + w[36] = __byte_perm (w[22], w[21], selector); + w[35] = __byte_perm (w[21], w[20], selector); + w[34] = __byte_perm (w[20], w[19], selector); + w[33] = __byte_perm (w[19], w[18], selector); + w[32] = __byte_perm (w[18], w[17], selector); + w[31] = __byte_perm (w[17], w[16], selector); + w[30] = __byte_perm (w[16], w[15], selector); + w[29] = __byte_perm (w[15], w[14], selector); + w[28] = __byte_perm (w[14], w[13], selector); + w[27] = __byte_perm (w[13], w[12], selector); + w[26] = __byte_perm (w[12], w[11], selector); + w[25] = __byte_perm (w[11], w[10], selector); + w[24] = __byte_perm (w[10], w[ 9], selector); + w[23] = __byte_perm (w[ 9], w[ 8], selector); + w[22] = __byte_perm (w[ 8], w[ 7], selector); + w[21] = __byte_perm (w[ 7], w[ 6], selector); + w[20] = __byte_perm (w[ 6], w[ 5], selector); + w[19] = __byte_perm (w[ 5], w[ 4], selector); + w[18] = __byte_perm (w[ 4], w[ 3], selector); + w[17] = __byte_perm (w[ 3], w[ 2], selector); + w[16] = __byte_perm (w[ 2], w[ 1], selector); + w[15] = __byte_perm (w[ 1], w[ 0], selector); + w[14] = __byte_perm (w[ 0], 0, selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm (w[48], w[47], selector); + w[62] = __byte_perm (w[47], w[46], selector); + w[61] = __byte_perm (w[46], w[45], selector); + w[60] = __byte_perm (w[45], w[44], selector); + w[59] = __byte_perm (w[44], w[43], selector); + w[58] = __byte_perm (w[43], w[42], selector); + w[57] = __byte_perm (w[42], w[41], selector); + w[56] = __byte_perm (w[41], w[40], selector); + w[55] = __byte_perm (w[40], w[39], selector); + w[54] = __byte_perm (w[39], w[38], selector); + w[53] = __byte_perm (w[38], w[37], selector); + w[52] = __byte_perm (w[37], w[36], selector); + w[51] = __byte_perm (w[36], w[35], selector); + w[50] = __byte_perm (w[35], w[34], selector); + w[49] = __byte_perm (w[34], w[33], selector); + w[48] = __byte_perm (w[33], w[32], selector); + w[47] = __byte_perm (w[32], w[31], selector); + w[46] = __byte_perm (w[31], w[30], selector); + w[45] = __byte_perm (w[30], w[29], selector); + w[44] = __byte_perm (w[29], w[28], selector); + w[43] = __byte_perm (w[28], w[27], selector); + w[42] = __byte_perm (w[27], w[26], selector); + w[41] = __byte_perm (w[26], w[25], selector); + w[40] = __byte_perm (w[25], w[24], selector); + w[39] = __byte_perm (w[24], w[23], selector); + w[38] = __byte_perm (w[23], w[22], selector); + w[37] = __byte_perm (w[22], w[21], selector); + w[36] = __byte_perm (w[21], w[20], selector); + w[35] = __byte_perm (w[20], w[19], selector); + w[34] = __byte_perm (w[19], w[18], selector); + w[33] = __byte_perm (w[18], w[17], selector); + w[32] = __byte_perm (w[17], w[16], selector); + w[31] = __byte_perm (w[16], w[15], selector); + w[30] = __byte_perm (w[15], w[14], selector); + w[29] = __byte_perm (w[14], w[13], selector); + w[28] = __byte_perm (w[13], w[12], selector); + w[27] = __byte_perm (w[12], w[11], selector); + w[26] = __byte_perm (w[11], w[10], selector); + w[25] = __byte_perm (w[10], w[ 9], selector); + w[24] = __byte_perm (w[ 9], w[ 8], selector); + w[23] = __byte_perm (w[ 8], w[ 7], selector); + w[22] = __byte_perm (w[ 7], w[ 6], selector); + w[21] = __byte_perm (w[ 6], w[ 5], selector); + w[20] = __byte_perm (w[ 5], w[ 4], selector); + w[19] = __byte_perm (w[ 4], w[ 3], selector); + w[18] = __byte_perm (w[ 3], w[ 2], selector); + w[17] = __byte_perm (w[ 2], w[ 1], selector); + w[16] = __byte_perm (w[ 1], w[ 0], selector); + w[15] = __byte_perm (w[ 0], 0, selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm (w[47], w[46], selector); + w[62] = __byte_perm (w[46], w[45], selector); + w[61] = __byte_perm (w[45], w[44], selector); + w[60] = __byte_perm (w[44], w[43], selector); + w[59] = __byte_perm (w[43], w[42], selector); + w[58] = __byte_perm (w[42], w[41], selector); + w[57] = __byte_perm (w[41], w[40], selector); + w[56] = __byte_perm (w[40], w[39], selector); + w[55] = __byte_perm (w[39], w[38], selector); + w[54] = __byte_perm (w[38], w[37], selector); + w[53] = __byte_perm (w[37], w[36], selector); + w[52] = __byte_perm (w[36], w[35], selector); + w[51] = __byte_perm (w[35], w[34], selector); + w[50] = __byte_perm (w[34], w[33], selector); + w[49] = __byte_perm (w[33], w[32], selector); + w[48] = __byte_perm (w[32], w[31], selector); + w[47] = __byte_perm (w[31], w[30], selector); + w[46] = __byte_perm (w[30], w[29], selector); + w[45] = __byte_perm (w[29], w[28], selector); + w[44] = __byte_perm (w[28], w[27], selector); + w[43] = __byte_perm (w[27], w[26], selector); + w[42] = __byte_perm (w[26], w[25], selector); + w[41] = __byte_perm (w[25], w[24], selector); + w[40] = __byte_perm (w[24], w[23], selector); + w[39] = __byte_perm (w[23], w[22], selector); + w[38] = __byte_perm (w[22], w[21], selector); + w[37] = __byte_perm (w[21], w[20], selector); + w[36] = __byte_perm (w[20], w[19], selector); + w[35] = __byte_perm (w[19], w[18], selector); + w[34] = __byte_perm (w[18], w[17], selector); + w[33] = __byte_perm (w[17], w[16], selector); + w[32] = __byte_perm (w[16], w[15], selector); + w[31] = __byte_perm (w[15], w[14], selector); + w[30] = __byte_perm (w[14], w[13], selector); + w[29] = __byte_perm (w[13], w[12], selector); + w[28] = __byte_perm (w[12], w[11], selector); + w[27] = __byte_perm (w[11], w[10], selector); + w[26] = __byte_perm (w[10], w[ 9], selector); + w[25] = __byte_perm (w[ 9], w[ 8], selector); + w[24] = __byte_perm (w[ 8], w[ 7], selector); + w[23] = __byte_perm (w[ 7], w[ 6], selector); + w[22] = __byte_perm (w[ 6], w[ 5], selector); + w[21] = __byte_perm (w[ 5], w[ 4], selector); + w[20] = __byte_perm (w[ 4], w[ 3], selector); + w[19] = __byte_perm (w[ 3], w[ 2], selector); + w[18] = __byte_perm (w[ 2], w[ 1], selector); + w[17] = __byte_perm (w[ 1], w[ 0], selector); + w[16] = __byte_perm (w[ 0], 0, selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm (w[46], w[45], selector); + w[62] = __byte_perm (w[45], w[44], selector); + w[61] = __byte_perm (w[44], w[43], selector); + w[60] = __byte_perm (w[43], w[42], selector); + w[59] = __byte_perm (w[42], w[41], selector); + w[58] = __byte_perm (w[41], w[40], selector); + w[57] = __byte_perm (w[40], w[39], selector); + w[56] = __byte_perm (w[39], w[38], selector); + w[55] = __byte_perm (w[38], w[37], selector); + w[54] = __byte_perm (w[37], w[36], selector); + w[53] = __byte_perm (w[36], w[35], selector); + w[52] = __byte_perm (w[35], w[34], selector); + w[51] = __byte_perm (w[34], w[33], selector); + w[50] = __byte_perm (w[33], w[32], selector); + w[49] = __byte_perm (w[32], w[31], selector); + w[48] = __byte_perm (w[31], w[30], selector); + w[47] = __byte_perm (w[30], w[29], selector); + w[46] = __byte_perm (w[29], w[28], selector); + w[45] = __byte_perm (w[28], w[27], selector); + w[44] = __byte_perm (w[27], w[26], selector); + w[43] = __byte_perm (w[26], w[25], selector); + w[42] = __byte_perm (w[25], w[24], selector); + w[41] = __byte_perm (w[24], w[23], selector); + w[40] = __byte_perm (w[23], w[22], selector); + w[39] = __byte_perm (w[22], w[21], selector); + w[38] = __byte_perm (w[21], w[20], selector); + w[37] = __byte_perm (w[20], w[19], selector); + w[36] = __byte_perm (w[19], w[18], selector); + w[35] = __byte_perm (w[18], w[17], selector); + w[34] = __byte_perm (w[17], w[16], selector); + w[33] = __byte_perm (w[16], w[15], selector); + w[32] = __byte_perm (w[15], w[14], selector); + w[31] = __byte_perm (w[14], w[13], selector); + w[30] = __byte_perm (w[13], w[12], selector); + w[29] = __byte_perm (w[12], w[11], selector); + w[28] = __byte_perm (w[11], w[10], selector); + w[27] = __byte_perm (w[10], w[ 9], selector); + w[26] = __byte_perm (w[ 9], w[ 8], selector); + w[25] = __byte_perm (w[ 8], w[ 7], selector); + w[24] = __byte_perm (w[ 7], w[ 6], selector); + w[23] = __byte_perm (w[ 6], w[ 5], selector); + w[22] = __byte_perm (w[ 5], w[ 4], selector); + w[21] = __byte_perm (w[ 4], w[ 3], selector); + w[20] = __byte_perm (w[ 3], w[ 2], selector); + w[19] = __byte_perm (w[ 2], w[ 1], selector); + w[18] = __byte_perm (w[ 1], w[ 0], selector); + w[17] = __byte_perm (w[ 0], 0, selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm (w[45], w[44], selector); + w[62] = __byte_perm (w[44], w[43], selector); + w[61] = __byte_perm (w[43], w[42], selector); + w[60] = __byte_perm (w[42], w[41], selector); + w[59] = __byte_perm (w[41], w[40], selector); + w[58] = __byte_perm (w[40], w[39], selector); + w[57] = __byte_perm (w[39], w[38], selector); + w[56] = __byte_perm (w[38], w[37], selector); + w[55] = __byte_perm (w[37], w[36], selector); + w[54] = __byte_perm (w[36], w[35], selector); + w[53] = __byte_perm (w[35], w[34], selector); + w[52] = __byte_perm (w[34], w[33], selector); + w[51] = __byte_perm (w[33], w[32], selector); + w[50] = __byte_perm (w[32], w[31], selector); + w[49] = __byte_perm (w[31], w[30], selector); + w[48] = __byte_perm (w[30], w[29], selector); + w[47] = __byte_perm (w[29], w[28], selector); + w[46] = __byte_perm (w[28], w[27], selector); + w[45] = __byte_perm (w[27], w[26], selector); + w[44] = __byte_perm (w[26], w[25], selector); + w[43] = __byte_perm (w[25], w[24], selector); + w[42] = __byte_perm (w[24], w[23], selector); + w[41] = __byte_perm (w[23], w[22], selector); + w[40] = __byte_perm (w[22], w[21], selector); + w[39] = __byte_perm (w[21], w[20], selector); + w[38] = __byte_perm (w[20], w[19], selector); + w[37] = __byte_perm (w[19], w[18], selector); + w[36] = __byte_perm (w[18], w[17], selector); + w[35] = __byte_perm (w[17], w[16], selector); + w[34] = __byte_perm (w[16], w[15], selector); + w[33] = __byte_perm (w[15], w[14], selector); + w[32] = __byte_perm (w[14], w[13], selector); + w[31] = __byte_perm (w[13], w[12], selector); + w[30] = __byte_perm (w[12], w[11], selector); + w[29] = __byte_perm (w[11], w[10], selector); + w[28] = __byte_perm (w[10], w[ 9], selector); + w[27] = __byte_perm (w[ 9], w[ 8], selector); + w[26] = __byte_perm (w[ 8], w[ 7], selector); + w[25] = __byte_perm (w[ 7], w[ 6], selector); + w[24] = __byte_perm (w[ 6], w[ 5], selector); + w[23] = __byte_perm (w[ 5], w[ 4], selector); + w[22] = __byte_perm (w[ 4], w[ 3], selector); + w[21] = __byte_perm (w[ 3], w[ 2], selector); + w[20] = __byte_perm (w[ 2], w[ 1], selector); + w[19] = __byte_perm (w[ 1], w[ 0], selector); + w[18] = __byte_perm (w[ 0], 0, selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm (w[44], w[43], selector); + w[62] = __byte_perm (w[43], w[42], selector); + w[61] = __byte_perm (w[42], w[41], selector); + w[60] = __byte_perm (w[41], w[40], selector); + w[59] = __byte_perm (w[40], w[39], selector); + w[58] = __byte_perm (w[39], w[38], selector); + w[57] = __byte_perm (w[38], w[37], selector); + w[56] = __byte_perm (w[37], w[36], selector); + w[55] = __byte_perm (w[36], w[35], selector); + w[54] = __byte_perm (w[35], w[34], selector); + w[53] = __byte_perm (w[34], w[33], selector); + w[52] = __byte_perm (w[33], w[32], selector); + w[51] = __byte_perm (w[32], w[31], selector); + w[50] = __byte_perm (w[31], w[30], selector); + w[49] = __byte_perm (w[30], w[29], selector); + w[48] = __byte_perm (w[29], w[28], selector); + w[47] = __byte_perm (w[28], w[27], selector); + w[46] = __byte_perm (w[27], w[26], selector); + w[45] = __byte_perm (w[26], w[25], selector); + w[44] = __byte_perm (w[25], w[24], selector); + w[43] = __byte_perm (w[24], w[23], selector); + w[42] = __byte_perm (w[23], w[22], selector); + w[41] = __byte_perm (w[22], w[21], selector); + w[40] = __byte_perm (w[21], w[20], selector); + w[39] = __byte_perm (w[20], w[19], selector); + w[38] = __byte_perm (w[19], w[18], selector); + w[37] = __byte_perm (w[18], w[17], selector); + w[36] = __byte_perm (w[17], w[16], selector); + w[35] = __byte_perm (w[16], w[15], selector); + w[34] = __byte_perm (w[15], w[14], selector); + w[33] = __byte_perm (w[14], w[13], selector); + w[32] = __byte_perm (w[13], w[12], selector); + w[31] = __byte_perm (w[12], w[11], selector); + w[30] = __byte_perm (w[11], w[10], selector); + w[29] = __byte_perm (w[10], w[ 9], selector); + w[28] = __byte_perm (w[ 9], w[ 8], selector); + w[27] = __byte_perm (w[ 8], w[ 7], selector); + w[26] = __byte_perm (w[ 7], w[ 6], selector); + w[25] = __byte_perm (w[ 6], w[ 5], selector); + w[24] = __byte_perm (w[ 5], w[ 4], selector); + w[23] = __byte_perm (w[ 4], w[ 3], selector); + w[22] = __byte_perm (w[ 3], w[ 2], selector); + w[21] = __byte_perm (w[ 2], w[ 1], selector); + w[20] = __byte_perm (w[ 1], w[ 0], selector); + w[19] = __byte_perm (w[ 0], 0, selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm (w[43], w[42], selector); + w[62] = __byte_perm (w[42], w[41], selector); + w[61] = __byte_perm (w[41], w[40], selector); + w[60] = __byte_perm (w[40], w[39], selector); + w[59] = __byte_perm (w[39], w[38], selector); + w[58] = __byte_perm (w[38], w[37], selector); + w[57] = __byte_perm (w[37], w[36], selector); + w[56] = __byte_perm (w[36], w[35], selector); + w[55] = __byte_perm (w[35], w[34], selector); + w[54] = __byte_perm (w[34], w[33], selector); + w[53] = __byte_perm (w[33], w[32], selector); + w[52] = __byte_perm (w[32], w[31], selector); + w[51] = __byte_perm (w[31], w[30], selector); + w[50] = __byte_perm (w[30], w[29], selector); + w[49] = __byte_perm (w[29], w[28], selector); + w[48] = __byte_perm (w[28], w[27], selector); + w[47] = __byte_perm (w[27], w[26], selector); + w[46] = __byte_perm (w[26], w[25], selector); + w[45] = __byte_perm (w[25], w[24], selector); + w[44] = __byte_perm (w[24], w[23], selector); + w[43] = __byte_perm (w[23], w[22], selector); + w[42] = __byte_perm (w[22], w[21], selector); + w[41] = __byte_perm (w[21], w[20], selector); + w[40] = __byte_perm (w[20], w[19], selector); + w[39] = __byte_perm (w[19], w[18], selector); + w[38] = __byte_perm (w[18], w[17], selector); + w[37] = __byte_perm (w[17], w[16], selector); + w[36] = __byte_perm (w[16], w[15], selector); + w[35] = __byte_perm (w[15], w[14], selector); + w[34] = __byte_perm (w[14], w[13], selector); + w[33] = __byte_perm (w[13], w[12], selector); + w[32] = __byte_perm (w[12], w[11], selector); + w[31] = __byte_perm (w[11], w[10], selector); + w[30] = __byte_perm (w[10], w[ 9], selector); + w[29] = __byte_perm (w[ 9], w[ 8], selector); + w[28] = __byte_perm (w[ 8], w[ 7], selector); + w[27] = __byte_perm (w[ 7], w[ 6], selector); + w[26] = __byte_perm (w[ 6], w[ 5], selector); + w[25] = __byte_perm (w[ 5], w[ 4], selector); + w[24] = __byte_perm (w[ 4], w[ 3], selector); + w[23] = __byte_perm (w[ 3], w[ 2], selector); + w[22] = __byte_perm (w[ 2], w[ 1], selector); + w[21] = __byte_perm (w[ 1], w[ 0], selector); + w[20] = __byte_perm (w[ 0], 0, selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm (w[42], w[41], selector); + w[62] = __byte_perm (w[41], w[40], selector); + w[61] = __byte_perm (w[40], w[39], selector); + w[60] = __byte_perm (w[39], w[38], selector); + w[59] = __byte_perm (w[38], w[37], selector); + w[58] = __byte_perm (w[37], w[36], selector); + w[57] = __byte_perm (w[36], w[35], selector); + w[56] = __byte_perm (w[35], w[34], selector); + w[55] = __byte_perm (w[34], w[33], selector); + w[54] = __byte_perm (w[33], w[32], selector); + w[53] = __byte_perm (w[32], w[31], selector); + w[52] = __byte_perm (w[31], w[30], selector); + w[51] = __byte_perm (w[30], w[29], selector); + w[50] = __byte_perm (w[29], w[28], selector); + w[49] = __byte_perm (w[28], w[27], selector); + w[48] = __byte_perm (w[27], w[26], selector); + w[47] = __byte_perm (w[26], w[25], selector); + w[46] = __byte_perm (w[25], w[24], selector); + w[45] = __byte_perm (w[24], w[23], selector); + w[44] = __byte_perm (w[23], w[22], selector); + w[43] = __byte_perm (w[22], w[21], selector); + w[42] = __byte_perm (w[21], w[20], selector); + w[41] = __byte_perm (w[20], w[19], selector); + w[40] = __byte_perm (w[19], w[18], selector); + w[39] = __byte_perm (w[18], w[17], selector); + w[38] = __byte_perm (w[17], w[16], selector); + w[37] = __byte_perm (w[16], w[15], selector); + w[36] = __byte_perm (w[15], w[14], selector); + w[35] = __byte_perm (w[14], w[13], selector); + w[34] = __byte_perm (w[13], w[12], selector); + w[33] = __byte_perm (w[12], w[11], selector); + w[32] = __byte_perm (w[11], w[10], selector); + w[31] = __byte_perm (w[10], w[ 9], selector); + w[30] = __byte_perm (w[ 9], w[ 8], selector); + w[29] = __byte_perm (w[ 8], w[ 7], selector); + w[28] = __byte_perm (w[ 7], w[ 6], selector); + w[27] = __byte_perm (w[ 6], w[ 5], selector); + w[26] = __byte_perm (w[ 5], w[ 4], selector); + w[25] = __byte_perm (w[ 4], w[ 3], selector); + w[24] = __byte_perm (w[ 3], w[ 2], selector); + w[23] = __byte_perm (w[ 2], w[ 1], selector); + w[22] = __byte_perm (w[ 1], w[ 0], selector); + w[21] = __byte_perm (w[ 0], 0, selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm (w[41], w[40], selector); + w[62] = __byte_perm (w[40], w[39], selector); + w[61] = __byte_perm (w[39], w[38], selector); + w[60] = __byte_perm (w[38], w[37], selector); + w[59] = __byte_perm (w[37], w[36], selector); + w[58] = __byte_perm (w[36], w[35], selector); + w[57] = __byte_perm (w[35], w[34], selector); + w[56] = __byte_perm (w[34], w[33], selector); + w[55] = __byte_perm (w[33], w[32], selector); + w[54] = __byte_perm (w[32], w[31], selector); + w[53] = __byte_perm (w[31], w[30], selector); + w[52] = __byte_perm (w[30], w[29], selector); + w[51] = __byte_perm (w[29], w[28], selector); + w[50] = __byte_perm (w[28], w[27], selector); + w[49] = __byte_perm (w[27], w[26], selector); + w[48] = __byte_perm (w[26], w[25], selector); + w[47] = __byte_perm (w[25], w[24], selector); + w[46] = __byte_perm (w[24], w[23], selector); + w[45] = __byte_perm (w[23], w[22], selector); + w[44] = __byte_perm (w[22], w[21], selector); + w[43] = __byte_perm (w[21], w[20], selector); + w[42] = __byte_perm (w[20], w[19], selector); + w[41] = __byte_perm (w[19], w[18], selector); + w[40] = __byte_perm (w[18], w[17], selector); + w[39] = __byte_perm (w[17], w[16], selector); + w[38] = __byte_perm (w[16], w[15], selector); + w[37] = __byte_perm (w[15], w[14], selector); + w[36] = __byte_perm (w[14], w[13], selector); + w[35] = __byte_perm (w[13], w[12], selector); + w[34] = __byte_perm (w[12], w[11], selector); + w[33] = __byte_perm (w[11], w[10], selector); + w[32] = __byte_perm (w[10], w[ 9], selector); + w[31] = __byte_perm (w[ 9], w[ 8], selector); + w[30] = __byte_perm (w[ 8], w[ 7], selector); + w[29] = __byte_perm (w[ 7], w[ 6], selector); + w[28] = __byte_perm (w[ 6], w[ 5], selector); + w[27] = __byte_perm (w[ 5], w[ 4], selector); + w[26] = __byte_perm (w[ 4], w[ 3], selector); + w[25] = __byte_perm (w[ 3], w[ 2], selector); + w[24] = __byte_perm (w[ 2], w[ 1], selector); + w[23] = __byte_perm (w[ 1], w[ 0], selector); + w[22] = __byte_perm (w[ 0], 0, selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = __byte_perm (w[40], w[39], selector); + w[62] = __byte_perm (w[39], w[38], selector); + w[61] = __byte_perm (w[38], w[37], selector); + w[60] = __byte_perm (w[37], w[36], selector); + w[59] = __byte_perm (w[36], w[35], selector); + w[58] = __byte_perm (w[35], w[34], selector); + w[57] = __byte_perm (w[34], w[33], selector); + w[56] = __byte_perm (w[33], w[32], selector); + w[55] = __byte_perm (w[32], w[31], selector); + w[54] = __byte_perm (w[31], w[30], selector); + w[53] = __byte_perm (w[30], w[29], selector); + w[52] = __byte_perm (w[29], w[28], selector); + w[51] = __byte_perm (w[28], w[27], selector); + w[50] = __byte_perm (w[27], w[26], selector); + w[49] = __byte_perm (w[26], w[25], selector); + w[48] = __byte_perm (w[25], w[24], selector); + w[47] = __byte_perm (w[24], w[23], selector); + w[46] = __byte_perm (w[23], w[22], selector); + w[45] = __byte_perm (w[22], w[21], selector); + w[44] = __byte_perm (w[21], w[20], selector); + w[43] = __byte_perm (w[20], w[19], selector); + w[42] = __byte_perm (w[19], w[18], selector); + w[41] = __byte_perm (w[18], w[17], selector); + w[40] = __byte_perm (w[17], w[16], selector); + w[39] = __byte_perm (w[16], w[15], selector); + w[38] = __byte_perm (w[15], w[14], selector); + w[37] = __byte_perm (w[14], w[13], selector); + w[36] = __byte_perm (w[13], w[12], selector); + w[35] = __byte_perm (w[12], w[11], selector); + w[34] = __byte_perm (w[11], w[10], selector); + w[33] = __byte_perm (w[10], w[ 9], selector); + w[32] = __byte_perm (w[ 9], w[ 8], selector); + w[31] = __byte_perm (w[ 8], w[ 7], selector); + w[30] = __byte_perm (w[ 7], w[ 6], selector); + w[29] = __byte_perm (w[ 6], w[ 5], selector); + w[28] = __byte_perm (w[ 5], w[ 4], selector); + w[27] = __byte_perm (w[ 4], w[ 3], selector); + w[26] = __byte_perm (w[ 3], w[ 2], selector); + w[25] = __byte_perm (w[ 2], w[ 1], selector); + w[24] = __byte_perm (w[ 1], w[ 0], selector); + w[23] = __byte_perm (w[ 0], 0, selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm (w[39], w[38], selector); + w[62] = __byte_perm (w[38], w[37], selector); + w[61] = __byte_perm (w[37], w[36], selector); + w[60] = __byte_perm (w[36], w[35], selector); + w[59] = __byte_perm (w[35], w[34], selector); + w[58] = __byte_perm (w[34], w[33], selector); + w[57] = __byte_perm (w[33], w[32], selector); + w[56] = __byte_perm (w[32], w[31], selector); + w[55] = __byte_perm (w[31], w[30], selector); + w[54] = __byte_perm (w[30], w[29], selector); + w[53] = __byte_perm (w[29], w[28], selector); + w[52] = __byte_perm (w[28], w[27], selector); + w[51] = __byte_perm (w[27], w[26], selector); + w[50] = __byte_perm (w[26], w[25], selector); + w[49] = __byte_perm (w[25], w[24], selector); + w[48] = __byte_perm (w[24], w[23], selector); + w[47] = __byte_perm (w[23], w[22], selector); + w[46] = __byte_perm (w[22], w[21], selector); + w[45] = __byte_perm (w[21], w[20], selector); + w[44] = __byte_perm (w[20], w[19], selector); + w[43] = __byte_perm (w[19], w[18], selector); + w[42] = __byte_perm (w[18], w[17], selector); + w[41] = __byte_perm (w[17], w[16], selector); + w[40] = __byte_perm (w[16], w[15], selector); + w[39] = __byte_perm (w[15], w[14], selector); + w[38] = __byte_perm (w[14], w[13], selector); + w[37] = __byte_perm (w[13], w[12], selector); + w[36] = __byte_perm (w[12], w[11], selector); + w[35] = __byte_perm (w[11], w[10], selector); + w[34] = __byte_perm (w[10], w[ 9], selector); + w[33] = __byte_perm (w[ 9], w[ 8], selector); + w[32] = __byte_perm (w[ 8], w[ 7], selector); + w[31] = __byte_perm (w[ 7], w[ 6], selector); + w[30] = __byte_perm (w[ 6], w[ 5], selector); + w[29] = __byte_perm (w[ 5], w[ 4], selector); + w[28] = __byte_perm (w[ 4], w[ 3], selector); + w[27] = __byte_perm (w[ 3], w[ 2], selector); + w[26] = __byte_perm (w[ 2], w[ 1], selector); + w[25] = __byte_perm (w[ 1], w[ 0], selector); + w[24] = __byte_perm (w[ 0], 0, selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm (w[38], w[37], selector); + w[62] = __byte_perm (w[37], w[36], selector); + w[61] = __byte_perm (w[36], w[35], selector); + w[60] = __byte_perm (w[35], w[34], selector); + w[59] = __byte_perm (w[34], w[33], selector); + w[58] = __byte_perm (w[33], w[32], selector); + w[57] = __byte_perm (w[32], w[31], selector); + w[56] = __byte_perm (w[31], w[30], selector); + w[55] = __byte_perm (w[30], w[29], selector); + w[54] = __byte_perm (w[29], w[28], selector); + w[53] = __byte_perm (w[28], w[27], selector); + w[52] = __byte_perm (w[27], w[26], selector); + w[51] = __byte_perm (w[26], w[25], selector); + w[50] = __byte_perm (w[25], w[24], selector); + w[49] = __byte_perm (w[24], w[23], selector); + w[48] = __byte_perm (w[23], w[22], selector); + w[47] = __byte_perm (w[22], w[21], selector); + w[46] = __byte_perm (w[21], w[20], selector); + w[45] = __byte_perm (w[20], w[19], selector); + w[44] = __byte_perm (w[19], w[18], selector); + w[43] = __byte_perm (w[18], w[17], selector); + w[42] = __byte_perm (w[17], w[16], selector); + w[41] = __byte_perm (w[16], w[15], selector); + w[40] = __byte_perm (w[15], w[14], selector); + w[39] = __byte_perm (w[14], w[13], selector); + w[38] = __byte_perm (w[13], w[12], selector); + w[37] = __byte_perm (w[12], w[11], selector); + w[36] = __byte_perm (w[11], w[10], selector); + w[35] = __byte_perm (w[10], w[ 9], selector); + w[34] = __byte_perm (w[ 9], w[ 8], selector); + w[33] = __byte_perm (w[ 8], w[ 7], selector); + w[32] = __byte_perm (w[ 7], w[ 6], selector); + w[31] = __byte_perm (w[ 6], w[ 5], selector); + w[30] = __byte_perm (w[ 5], w[ 4], selector); + w[29] = __byte_perm (w[ 4], w[ 3], selector); + w[28] = __byte_perm (w[ 3], w[ 2], selector); + w[27] = __byte_perm (w[ 2], w[ 1], selector); + w[26] = __byte_perm (w[ 1], w[ 0], selector); + w[25] = __byte_perm (w[ 0], 0, selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm (w[37], w[36], selector); + w[62] = __byte_perm (w[36], w[35], selector); + w[61] = __byte_perm (w[35], w[34], selector); + w[60] = __byte_perm (w[34], w[33], selector); + w[59] = __byte_perm (w[33], w[32], selector); + w[58] = __byte_perm (w[32], w[31], selector); + w[57] = __byte_perm (w[31], w[30], selector); + w[56] = __byte_perm (w[30], w[29], selector); + w[55] = __byte_perm (w[29], w[28], selector); + w[54] = __byte_perm (w[28], w[27], selector); + w[53] = __byte_perm (w[27], w[26], selector); + w[52] = __byte_perm (w[26], w[25], selector); + w[51] = __byte_perm (w[25], w[24], selector); + w[50] = __byte_perm (w[24], w[23], selector); + w[49] = __byte_perm (w[23], w[22], selector); + w[48] = __byte_perm (w[22], w[21], selector); + w[47] = __byte_perm (w[21], w[20], selector); + w[46] = __byte_perm (w[20], w[19], selector); + w[45] = __byte_perm (w[19], w[18], selector); + w[44] = __byte_perm (w[18], w[17], selector); + w[43] = __byte_perm (w[17], w[16], selector); + w[42] = __byte_perm (w[16], w[15], selector); + w[41] = __byte_perm (w[15], w[14], selector); + w[40] = __byte_perm (w[14], w[13], selector); + w[39] = __byte_perm (w[13], w[12], selector); + w[38] = __byte_perm (w[12], w[11], selector); + w[37] = __byte_perm (w[11], w[10], selector); + w[36] = __byte_perm (w[10], w[ 9], selector); + w[35] = __byte_perm (w[ 9], w[ 8], selector); + w[34] = __byte_perm (w[ 8], w[ 7], selector); + w[33] = __byte_perm (w[ 7], w[ 6], selector); + w[32] = __byte_perm (w[ 6], w[ 5], selector); + w[31] = __byte_perm (w[ 5], w[ 4], selector); + w[30] = __byte_perm (w[ 4], w[ 3], selector); + w[29] = __byte_perm (w[ 3], w[ 2], selector); + w[28] = __byte_perm (w[ 2], w[ 1], selector); + w[27] = __byte_perm (w[ 1], w[ 0], selector); + w[26] = __byte_perm (w[ 0], 0, selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm (w[36], w[35], selector); + w[62] = __byte_perm (w[35], w[34], selector); + w[61] = __byte_perm (w[34], w[33], selector); + w[60] = __byte_perm (w[33], w[32], selector); + w[59] = __byte_perm (w[32], w[31], selector); + w[58] = __byte_perm (w[31], w[30], selector); + w[57] = __byte_perm (w[30], w[29], selector); + w[56] = __byte_perm (w[29], w[28], selector); + w[55] = __byte_perm (w[28], w[27], selector); + w[54] = __byte_perm (w[27], w[26], selector); + w[53] = __byte_perm (w[26], w[25], selector); + w[52] = __byte_perm (w[25], w[24], selector); + w[51] = __byte_perm (w[24], w[23], selector); + w[50] = __byte_perm (w[23], w[22], selector); + w[49] = __byte_perm (w[22], w[21], selector); + w[48] = __byte_perm (w[21], w[20], selector); + w[47] = __byte_perm (w[20], w[19], selector); + w[46] = __byte_perm (w[19], w[18], selector); + w[45] = __byte_perm (w[18], w[17], selector); + w[44] = __byte_perm (w[17], w[16], selector); + w[43] = __byte_perm (w[16], w[15], selector); + w[42] = __byte_perm (w[15], w[14], selector); + w[41] = __byte_perm (w[14], w[13], selector); + w[40] = __byte_perm (w[13], w[12], selector); + w[39] = __byte_perm (w[12], w[11], selector); + w[38] = __byte_perm (w[11], w[10], selector); + w[37] = __byte_perm (w[10], w[ 9], selector); + w[36] = __byte_perm (w[ 9], w[ 8], selector); + w[35] = __byte_perm (w[ 8], w[ 7], selector); + w[34] = __byte_perm (w[ 7], w[ 6], selector); + w[33] = __byte_perm (w[ 6], w[ 5], selector); + w[32] = __byte_perm (w[ 5], w[ 4], selector); + w[31] = __byte_perm (w[ 4], w[ 3], selector); + w[30] = __byte_perm (w[ 3], w[ 2], selector); + w[29] = __byte_perm (w[ 2], w[ 1], selector); + w[28] = __byte_perm (w[ 1], w[ 0], selector); + w[27] = __byte_perm (w[ 0], 0, selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm (w[35], w[34], selector); + w[62] = __byte_perm (w[34], w[33], selector); + w[61] = __byte_perm (w[33], w[32], selector); + w[60] = __byte_perm (w[32], w[31], selector); + w[59] = __byte_perm (w[31], w[30], selector); + w[58] = __byte_perm (w[30], w[29], selector); + w[57] = __byte_perm (w[29], w[28], selector); + w[56] = __byte_perm (w[28], w[27], selector); + w[55] = __byte_perm (w[27], w[26], selector); + w[54] = __byte_perm (w[26], w[25], selector); + w[53] = __byte_perm (w[25], w[24], selector); + w[52] = __byte_perm (w[24], w[23], selector); + w[51] = __byte_perm (w[23], w[22], selector); + w[50] = __byte_perm (w[22], w[21], selector); + w[49] = __byte_perm (w[21], w[20], selector); + w[48] = __byte_perm (w[20], w[19], selector); + w[47] = __byte_perm (w[19], w[18], selector); + w[46] = __byte_perm (w[18], w[17], selector); + w[45] = __byte_perm (w[17], w[16], selector); + w[44] = __byte_perm (w[16], w[15], selector); + w[43] = __byte_perm (w[15], w[14], selector); + w[42] = __byte_perm (w[14], w[13], selector); + w[41] = __byte_perm (w[13], w[12], selector); + w[40] = __byte_perm (w[12], w[11], selector); + w[39] = __byte_perm (w[11], w[10], selector); + w[38] = __byte_perm (w[10], w[ 9], selector); + w[37] = __byte_perm (w[ 9], w[ 8], selector); + w[36] = __byte_perm (w[ 8], w[ 7], selector); + w[35] = __byte_perm (w[ 7], w[ 6], selector); + w[34] = __byte_perm (w[ 6], w[ 5], selector); + w[33] = __byte_perm (w[ 5], w[ 4], selector); + w[32] = __byte_perm (w[ 4], w[ 3], selector); + w[31] = __byte_perm (w[ 3], w[ 2], selector); + w[30] = __byte_perm (w[ 2], w[ 1], selector); + w[29] = __byte_perm (w[ 1], w[ 0], selector); + w[28] = __byte_perm (w[ 0], 0, selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm (w[34], w[33], selector); + w[62] = __byte_perm (w[33], w[32], selector); + w[61] = __byte_perm (w[32], w[31], selector); + w[60] = __byte_perm (w[31], w[30], selector); + w[59] = __byte_perm (w[30], w[29], selector); + w[58] = __byte_perm (w[29], w[28], selector); + w[57] = __byte_perm (w[28], w[27], selector); + w[56] = __byte_perm (w[27], w[26], selector); + w[55] = __byte_perm (w[26], w[25], selector); + w[54] = __byte_perm (w[25], w[24], selector); + w[53] = __byte_perm (w[24], w[23], selector); + w[52] = __byte_perm (w[23], w[22], selector); + w[51] = __byte_perm (w[22], w[21], selector); + w[50] = __byte_perm (w[21], w[20], selector); + w[49] = __byte_perm (w[20], w[19], selector); + w[48] = __byte_perm (w[19], w[18], selector); + w[47] = __byte_perm (w[18], w[17], selector); + w[46] = __byte_perm (w[17], w[16], selector); + w[45] = __byte_perm (w[16], w[15], selector); + w[44] = __byte_perm (w[15], w[14], selector); + w[43] = __byte_perm (w[14], w[13], selector); + w[42] = __byte_perm (w[13], w[12], selector); + w[41] = __byte_perm (w[12], w[11], selector); + w[40] = __byte_perm (w[11], w[10], selector); + w[39] = __byte_perm (w[10], w[ 9], selector); + w[38] = __byte_perm (w[ 9], w[ 8], selector); + w[37] = __byte_perm (w[ 8], w[ 7], selector); + w[36] = __byte_perm (w[ 7], w[ 6], selector); + w[35] = __byte_perm (w[ 6], w[ 5], selector); + w[34] = __byte_perm (w[ 5], w[ 4], selector); + w[33] = __byte_perm (w[ 4], w[ 3], selector); + w[32] = __byte_perm (w[ 3], w[ 2], selector); + w[31] = __byte_perm (w[ 2], w[ 1], selector); + w[30] = __byte_perm (w[ 1], w[ 0], selector); + w[29] = __byte_perm (w[ 0], 0, selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm (w[33], w[32], selector); + w[62] = __byte_perm (w[32], w[31], selector); + w[61] = __byte_perm (w[31], w[30], selector); + w[60] = __byte_perm (w[30], w[29], selector); + w[59] = __byte_perm (w[29], w[28], selector); + w[58] = __byte_perm (w[28], w[27], selector); + w[57] = __byte_perm (w[27], w[26], selector); + w[56] = __byte_perm (w[26], w[25], selector); + w[55] = __byte_perm (w[25], w[24], selector); + w[54] = __byte_perm (w[24], w[23], selector); + w[53] = __byte_perm (w[23], w[22], selector); + w[52] = __byte_perm (w[22], w[21], selector); + w[51] = __byte_perm (w[21], w[20], selector); + w[50] = __byte_perm (w[20], w[19], selector); + w[49] = __byte_perm (w[19], w[18], selector); + w[48] = __byte_perm (w[18], w[17], selector); + w[47] = __byte_perm (w[17], w[16], selector); + w[46] = __byte_perm (w[16], w[15], selector); + w[45] = __byte_perm (w[15], w[14], selector); + w[44] = __byte_perm (w[14], w[13], selector); + w[43] = __byte_perm (w[13], w[12], selector); + w[42] = __byte_perm (w[12], w[11], selector); + w[41] = __byte_perm (w[11], w[10], selector); + w[40] = __byte_perm (w[10], w[ 9], selector); + w[39] = __byte_perm (w[ 9], w[ 8], selector); + w[38] = __byte_perm (w[ 8], w[ 7], selector); + w[37] = __byte_perm (w[ 7], w[ 6], selector); + w[36] = __byte_perm (w[ 6], w[ 5], selector); + w[35] = __byte_perm (w[ 5], w[ 4], selector); + w[34] = __byte_perm (w[ 4], w[ 3], selector); + w[33] = __byte_perm (w[ 3], w[ 2], selector); + w[32] = __byte_perm (w[ 2], w[ 1], selector); + w[31] = __byte_perm (w[ 1], w[ 0], selector); + w[30] = __byte_perm (w[ 0], 0, selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm (w[32], w[31], selector); + w[62] = __byte_perm (w[31], w[30], selector); + w[61] = __byte_perm (w[30], w[29], selector); + w[60] = __byte_perm (w[29], w[28], selector); + w[59] = __byte_perm (w[28], w[27], selector); + w[58] = __byte_perm (w[27], w[26], selector); + w[57] = __byte_perm (w[26], w[25], selector); + w[56] = __byte_perm (w[25], w[24], selector); + w[55] = __byte_perm (w[24], w[23], selector); + w[54] = __byte_perm (w[23], w[22], selector); + w[53] = __byte_perm (w[22], w[21], selector); + w[52] = __byte_perm (w[21], w[20], selector); + w[51] = __byte_perm (w[20], w[19], selector); + w[50] = __byte_perm (w[19], w[18], selector); + w[49] = __byte_perm (w[18], w[17], selector); + w[48] = __byte_perm (w[17], w[16], selector); + w[47] = __byte_perm (w[16], w[15], selector); + w[46] = __byte_perm (w[15], w[14], selector); + w[45] = __byte_perm (w[14], w[13], selector); + w[44] = __byte_perm (w[13], w[12], selector); + w[43] = __byte_perm (w[12], w[11], selector); + w[42] = __byte_perm (w[11], w[10], selector); + w[41] = __byte_perm (w[10], w[ 9], selector); + w[40] = __byte_perm (w[ 9], w[ 8], selector); + w[39] = __byte_perm (w[ 8], w[ 7], selector); + w[38] = __byte_perm (w[ 7], w[ 6], selector); + w[37] = __byte_perm (w[ 6], w[ 5], selector); + w[36] = __byte_perm (w[ 5], w[ 4], selector); + w[35] = __byte_perm (w[ 4], w[ 3], selector); + w[34] = __byte_perm (w[ 3], w[ 2], selector); + w[33] = __byte_perm (w[ 2], w[ 1], selector); + w[32] = __byte_perm (w[ 1], w[ 0], selector); + w[31] = __byte_perm (w[ 0], 0, selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm (w[31], w[30], selector); + w[62] = __byte_perm (w[30], w[29], selector); + w[61] = __byte_perm (w[29], w[28], selector); + w[60] = __byte_perm (w[28], w[27], selector); + w[59] = __byte_perm (w[27], w[26], selector); + w[58] = __byte_perm (w[26], w[25], selector); + w[57] = __byte_perm (w[25], w[24], selector); + w[56] = __byte_perm (w[24], w[23], selector); + w[55] = __byte_perm (w[23], w[22], selector); + w[54] = __byte_perm (w[22], w[21], selector); + w[53] = __byte_perm (w[21], w[20], selector); + w[52] = __byte_perm (w[20], w[19], selector); + w[51] = __byte_perm (w[19], w[18], selector); + w[50] = __byte_perm (w[18], w[17], selector); + w[49] = __byte_perm (w[17], w[16], selector); + w[48] = __byte_perm (w[16], w[15], selector); + w[47] = __byte_perm (w[15], w[14], selector); + w[46] = __byte_perm (w[14], w[13], selector); + w[45] = __byte_perm (w[13], w[12], selector); + w[44] = __byte_perm (w[12], w[11], selector); + w[43] = __byte_perm (w[11], w[10], selector); + w[42] = __byte_perm (w[10], w[ 9], selector); + w[41] = __byte_perm (w[ 9], w[ 8], selector); + w[40] = __byte_perm (w[ 8], w[ 7], selector); + w[39] = __byte_perm (w[ 7], w[ 6], selector); + w[38] = __byte_perm (w[ 6], w[ 5], selector); + w[37] = __byte_perm (w[ 5], w[ 4], selector); + w[36] = __byte_perm (w[ 4], w[ 3], selector); + w[35] = __byte_perm (w[ 3], w[ 2], selector); + w[34] = __byte_perm (w[ 2], w[ 1], selector); + w[33] = __byte_perm (w[ 1], w[ 0], selector); + w[32] = __byte_perm (w[ 0], 0, selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm (w[30], w[29], selector); + w[62] = __byte_perm (w[29], w[28], selector); + w[61] = __byte_perm (w[28], w[27], selector); + w[60] = __byte_perm (w[27], w[26], selector); + w[59] = __byte_perm (w[26], w[25], selector); + w[58] = __byte_perm (w[25], w[24], selector); + w[57] = __byte_perm (w[24], w[23], selector); + w[56] = __byte_perm (w[23], w[22], selector); + w[55] = __byte_perm (w[22], w[21], selector); + w[54] = __byte_perm (w[21], w[20], selector); + w[53] = __byte_perm (w[20], w[19], selector); + w[52] = __byte_perm (w[19], w[18], selector); + w[51] = __byte_perm (w[18], w[17], selector); + w[50] = __byte_perm (w[17], w[16], selector); + w[49] = __byte_perm (w[16], w[15], selector); + w[48] = __byte_perm (w[15], w[14], selector); + w[47] = __byte_perm (w[14], w[13], selector); + w[46] = __byte_perm (w[13], w[12], selector); + w[45] = __byte_perm (w[12], w[11], selector); + w[44] = __byte_perm (w[11], w[10], selector); + w[43] = __byte_perm (w[10], w[ 9], selector); + w[42] = __byte_perm (w[ 9], w[ 8], selector); + w[41] = __byte_perm (w[ 8], w[ 7], selector); + w[40] = __byte_perm (w[ 7], w[ 6], selector); + w[39] = __byte_perm (w[ 6], w[ 5], selector); + w[38] = __byte_perm (w[ 5], w[ 4], selector); + w[37] = __byte_perm (w[ 4], w[ 3], selector); + w[36] = __byte_perm (w[ 3], w[ 2], selector); + w[35] = __byte_perm (w[ 2], w[ 1], selector); + w[34] = __byte_perm (w[ 1], w[ 0], selector); + w[33] = __byte_perm (w[ 0], 0, selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm (w[29], w[28], selector); + w[62] = __byte_perm (w[28], w[27], selector); + w[61] = __byte_perm (w[27], w[26], selector); + w[60] = __byte_perm (w[26], w[25], selector); + w[59] = __byte_perm (w[25], w[24], selector); + w[58] = __byte_perm (w[24], w[23], selector); + w[57] = __byte_perm (w[23], w[22], selector); + w[56] = __byte_perm (w[22], w[21], selector); + w[55] = __byte_perm (w[21], w[20], selector); + w[54] = __byte_perm (w[20], w[19], selector); + w[53] = __byte_perm (w[19], w[18], selector); + w[52] = __byte_perm (w[18], w[17], selector); + w[51] = __byte_perm (w[17], w[16], selector); + w[50] = __byte_perm (w[16], w[15], selector); + w[49] = __byte_perm (w[15], w[14], selector); + w[48] = __byte_perm (w[14], w[13], selector); + w[47] = __byte_perm (w[13], w[12], selector); + w[46] = __byte_perm (w[12], w[11], selector); + w[45] = __byte_perm (w[11], w[10], selector); + w[44] = __byte_perm (w[10], w[ 9], selector); + w[43] = __byte_perm (w[ 9], w[ 8], selector); + w[42] = __byte_perm (w[ 8], w[ 7], selector); + w[41] = __byte_perm (w[ 7], w[ 6], selector); + w[40] = __byte_perm (w[ 6], w[ 5], selector); + w[39] = __byte_perm (w[ 5], w[ 4], selector); + w[38] = __byte_perm (w[ 4], w[ 3], selector); + w[37] = __byte_perm (w[ 3], w[ 2], selector); + w[36] = __byte_perm (w[ 2], w[ 1], selector); + w[35] = __byte_perm (w[ 1], w[ 0], selector); + w[34] = __byte_perm (w[ 0], 0, selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm (w[28], w[27], selector); + w[62] = __byte_perm (w[27], w[26], selector); + w[61] = __byte_perm (w[26], w[25], selector); + w[60] = __byte_perm (w[25], w[24], selector); + w[59] = __byte_perm (w[24], w[23], selector); + w[58] = __byte_perm (w[23], w[22], selector); + w[57] = __byte_perm (w[22], w[21], selector); + w[56] = __byte_perm (w[21], w[20], selector); + w[55] = __byte_perm (w[20], w[19], selector); + w[54] = __byte_perm (w[19], w[18], selector); + w[53] = __byte_perm (w[18], w[17], selector); + w[52] = __byte_perm (w[17], w[16], selector); + w[51] = __byte_perm (w[16], w[15], selector); + w[50] = __byte_perm (w[15], w[14], selector); + w[49] = __byte_perm (w[14], w[13], selector); + w[48] = __byte_perm (w[13], w[12], selector); + w[47] = __byte_perm (w[12], w[11], selector); + w[46] = __byte_perm (w[11], w[10], selector); + w[45] = __byte_perm (w[10], w[ 9], selector); + w[44] = __byte_perm (w[ 9], w[ 8], selector); + w[43] = __byte_perm (w[ 8], w[ 7], selector); + w[42] = __byte_perm (w[ 7], w[ 6], selector); + w[41] = __byte_perm (w[ 6], w[ 5], selector); + w[40] = __byte_perm (w[ 5], w[ 4], selector); + w[39] = __byte_perm (w[ 4], w[ 3], selector); + w[38] = __byte_perm (w[ 3], w[ 2], selector); + w[37] = __byte_perm (w[ 2], w[ 1], selector); + w[36] = __byte_perm (w[ 1], w[ 0], selector); + w[35] = __byte_perm (w[ 0], 0, selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm (w[27], w[26], selector); + w[62] = __byte_perm (w[26], w[25], selector); + w[61] = __byte_perm (w[25], w[24], selector); + w[60] = __byte_perm (w[24], w[23], selector); + w[59] = __byte_perm (w[23], w[22], selector); + w[58] = __byte_perm (w[22], w[21], selector); + w[57] = __byte_perm (w[21], w[20], selector); + w[56] = __byte_perm (w[20], w[19], selector); + w[55] = __byte_perm (w[19], w[18], selector); + w[54] = __byte_perm (w[18], w[17], selector); + w[53] = __byte_perm (w[17], w[16], selector); + w[52] = __byte_perm (w[16], w[15], selector); + w[51] = __byte_perm (w[15], w[14], selector); + w[50] = __byte_perm (w[14], w[13], selector); + w[49] = __byte_perm (w[13], w[12], selector); + w[48] = __byte_perm (w[12], w[11], selector); + w[47] = __byte_perm (w[11], w[10], selector); + w[46] = __byte_perm (w[10], w[ 9], selector); + w[45] = __byte_perm (w[ 9], w[ 8], selector); + w[44] = __byte_perm (w[ 8], w[ 7], selector); + w[43] = __byte_perm (w[ 7], w[ 6], selector); + w[42] = __byte_perm (w[ 6], w[ 5], selector); + w[41] = __byte_perm (w[ 5], w[ 4], selector); + w[40] = __byte_perm (w[ 4], w[ 3], selector); + w[39] = __byte_perm (w[ 3], w[ 2], selector); + w[38] = __byte_perm (w[ 2], w[ 1], selector); + w[37] = __byte_perm (w[ 1], w[ 0], selector); + w[36] = __byte_perm (w[ 0], 0, selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm (w[26], w[25], selector); + w[62] = __byte_perm (w[25], w[24], selector); + w[61] = __byte_perm (w[24], w[23], selector); + w[60] = __byte_perm (w[23], w[22], selector); + w[59] = __byte_perm (w[22], w[21], selector); + w[58] = __byte_perm (w[21], w[20], selector); + w[57] = __byte_perm (w[20], w[19], selector); + w[56] = __byte_perm (w[19], w[18], selector); + w[55] = __byte_perm (w[18], w[17], selector); + w[54] = __byte_perm (w[17], w[16], selector); + w[53] = __byte_perm (w[16], w[15], selector); + w[52] = __byte_perm (w[15], w[14], selector); + w[51] = __byte_perm (w[14], w[13], selector); + w[50] = __byte_perm (w[13], w[12], selector); + w[49] = __byte_perm (w[12], w[11], selector); + w[48] = __byte_perm (w[11], w[10], selector); + w[47] = __byte_perm (w[10], w[ 9], selector); + w[46] = __byte_perm (w[ 9], w[ 8], selector); + w[45] = __byte_perm (w[ 8], w[ 7], selector); + w[44] = __byte_perm (w[ 7], w[ 6], selector); + w[43] = __byte_perm (w[ 6], w[ 5], selector); + w[42] = __byte_perm (w[ 5], w[ 4], selector); + w[41] = __byte_perm (w[ 4], w[ 3], selector); + w[40] = __byte_perm (w[ 3], w[ 2], selector); + w[39] = __byte_perm (w[ 2], w[ 1], selector); + w[38] = __byte_perm (w[ 1], w[ 0], selector); + w[37] = __byte_perm (w[ 0], 0, selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm (w[25], w[24], selector); + w[62] = __byte_perm (w[24], w[23], selector); + w[61] = __byte_perm (w[23], w[22], selector); + w[60] = __byte_perm (w[22], w[21], selector); + w[59] = __byte_perm (w[21], w[20], selector); + w[58] = __byte_perm (w[20], w[19], selector); + w[57] = __byte_perm (w[19], w[18], selector); + w[56] = __byte_perm (w[18], w[17], selector); + w[55] = __byte_perm (w[17], w[16], selector); + w[54] = __byte_perm (w[16], w[15], selector); + w[53] = __byte_perm (w[15], w[14], selector); + w[52] = __byte_perm (w[14], w[13], selector); + w[51] = __byte_perm (w[13], w[12], selector); + w[50] = __byte_perm (w[12], w[11], selector); + w[49] = __byte_perm (w[11], w[10], selector); + w[48] = __byte_perm (w[10], w[ 9], selector); + w[47] = __byte_perm (w[ 9], w[ 8], selector); + w[46] = __byte_perm (w[ 8], w[ 7], selector); + w[45] = __byte_perm (w[ 7], w[ 6], selector); + w[44] = __byte_perm (w[ 6], w[ 5], selector); + w[43] = __byte_perm (w[ 5], w[ 4], selector); + w[42] = __byte_perm (w[ 4], w[ 3], selector); + w[41] = __byte_perm (w[ 3], w[ 2], selector); + w[40] = __byte_perm (w[ 2], w[ 1], selector); + w[39] = __byte_perm (w[ 1], w[ 0], selector); + w[38] = __byte_perm (w[ 0], 0, selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm (w[24], w[23], selector); + w[62] = __byte_perm (w[23], w[22], selector); + w[61] = __byte_perm (w[22], w[21], selector); + w[60] = __byte_perm (w[21], w[20], selector); + w[59] = __byte_perm (w[20], w[19], selector); + w[58] = __byte_perm (w[19], w[18], selector); + w[57] = __byte_perm (w[18], w[17], selector); + w[56] = __byte_perm (w[17], w[16], selector); + w[55] = __byte_perm (w[16], w[15], selector); + w[54] = __byte_perm (w[15], w[14], selector); + w[53] = __byte_perm (w[14], w[13], selector); + w[52] = __byte_perm (w[13], w[12], selector); + w[51] = __byte_perm (w[12], w[11], selector); + w[50] = __byte_perm (w[11], w[10], selector); + w[49] = __byte_perm (w[10], w[ 9], selector); + w[48] = __byte_perm (w[ 9], w[ 8], selector); + w[47] = __byte_perm (w[ 8], w[ 7], selector); + w[46] = __byte_perm (w[ 7], w[ 6], selector); + w[45] = __byte_perm (w[ 6], w[ 5], selector); + w[44] = __byte_perm (w[ 5], w[ 4], selector); + w[43] = __byte_perm (w[ 4], w[ 3], selector); + w[42] = __byte_perm (w[ 3], w[ 2], selector); + w[41] = __byte_perm (w[ 2], w[ 1], selector); + w[40] = __byte_perm (w[ 1], w[ 0], selector); + w[39] = __byte_perm (w[ 0], 0, selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm (w[23], w[22], selector); + w[62] = __byte_perm (w[22], w[21], selector); + w[61] = __byte_perm (w[21], w[20], selector); + w[60] = __byte_perm (w[20], w[19], selector); + w[59] = __byte_perm (w[19], w[18], selector); + w[58] = __byte_perm (w[18], w[17], selector); + w[57] = __byte_perm (w[17], w[16], selector); + w[56] = __byte_perm (w[16], w[15], selector); + w[55] = __byte_perm (w[15], w[14], selector); + w[54] = __byte_perm (w[14], w[13], selector); + w[53] = __byte_perm (w[13], w[12], selector); + w[52] = __byte_perm (w[12], w[11], selector); + w[51] = __byte_perm (w[11], w[10], selector); + w[50] = __byte_perm (w[10], w[ 9], selector); + w[49] = __byte_perm (w[ 9], w[ 8], selector); + w[48] = __byte_perm (w[ 8], w[ 7], selector); + w[47] = __byte_perm (w[ 7], w[ 6], selector); + w[46] = __byte_perm (w[ 6], w[ 5], selector); + w[45] = __byte_perm (w[ 5], w[ 4], selector); + w[44] = __byte_perm (w[ 4], w[ 3], selector); + w[43] = __byte_perm (w[ 3], w[ 2], selector); + w[42] = __byte_perm (w[ 2], w[ 1], selector); + w[41] = __byte_perm (w[ 1], w[ 0], selector); + w[40] = __byte_perm (w[ 0], 0, selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm (w[22], w[21], selector); + w[62] = __byte_perm (w[21], w[20], selector); + w[61] = __byte_perm (w[20], w[19], selector); + w[60] = __byte_perm (w[19], w[18], selector); + w[59] = __byte_perm (w[18], w[17], selector); + w[58] = __byte_perm (w[17], w[16], selector); + w[57] = __byte_perm (w[16], w[15], selector); + w[56] = __byte_perm (w[15], w[14], selector); + w[55] = __byte_perm (w[14], w[13], selector); + w[54] = __byte_perm (w[13], w[12], selector); + w[53] = __byte_perm (w[12], w[11], selector); + w[52] = __byte_perm (w[11], w[10], selector); + w[51] = __byte_perm (w[10], w[ 9], selector); + w[50] = __byte_perm (w[ 9], w[ 8], selector); + w[49] = __byte_perm (w[ 8], w[ 7], selector); + w[48] = __byte_perm (w[ 7], w[ 6], selector); + w[47] = __byte_perm (w[ 6], w[ 5], selector); + w[46] = __byte_perm (w[ 5], w[ 4], selector); + w[45] = __byte_perm (w[ 4], w[ 3], selector); + w[44] = __byte_perm (w[ 3], w[ 2], selector); + w[43] = __byte_perm (w[ 2], w[ 1], selector); + w[42] = __byte_perm (w[ 1], w[ 0], selector); + w[41] = __byte_perm (w[ 0], 0, selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm (w[21], w[20], selector); + w[62] = __byte_perm (w[20], w[19], selector); + w[61] = __byte_perm (w[19], w[18], selector); + w[60] = __byte_perm (w[18], w[17], selector); + w[59] = __byte_perm (w[17], w[16], selector); + w[58] = __byte_perm (w[16], w[15], selector); + w[57] = __byte_perm (w[15], w[14], selector); + w[56] = __byte_perm (w[14], w[13], selector); + w[55] = __byte_perm (w[13], w[12], selector); + w[54] = __byte_perm (w[12], w[11], selector); + w[53] = __byte_perm (w[11], w[10], selector); + w[52] = __byte_perm (w[10], w[ 9], selector); + w[51] = __byte_perm (w[ 9], w[ 8], selector); + w[50] = __byte_perm (w[ 8], w[ 7], selector); + w[49] = __byte_perm (w[ 7], w[ 6], selector); + w[48] = __byte_perm (w[ 6], w[ 5], selector); + w[47] = __byte_perm (w[ 5], w[ 4], selector); + w[46] = __byte_perm (w[ 4], w[ 3], selector); + w[45] = __byte_perm (w[ 3], w[ 2], selector); + w[44] = __byte_perm (w[ 2], w[ 1], selector); + w[43] = __byte_perm (w[ 1], w[ 0], selector); + w[42] = __byte_perm (w[ 0], 0, selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = __byte_perm (w[20], w[19], selector); + w[62] = __byte_perm (w[19], w[18], selector); + w[61] = __byte_perm (w[18], w[17], selector); + w[60] = __byte_perm (w[17], w[16], selector); + w[59] = __byte_perm (w[16], w[15], selector); + w[58] = __byte_perm (w[15], w[14], selector); + w[57] = __byte_perm (w[14], w[13], selector); + w[56] = __byte_perm (w[13], w[12], selector); + w[55] = __byte_perm (w[12], w[11], selector); + w[54] = __byte_perm (w[11], w[10], selector); + w[53] = __byte_perm (w[10], w[ 9], selector); + w[52] = __byte_perm (w[ 9], w[ 8], selector); + w[51] = __byte_perm (w[ 8], w[ 7], selector); + w[50] = __byte_perm (w[ 7], w[ 6], selector); + w[49] = __byte_perm (w[ 6], w[ 5], selector); + w[48] = __byte_perm (w[ 5], w[ 4], selector); + w[47] = __byte_perm (w[ 4], w[ 3], selector); + w[46] = __byte_perm (w[ 3], w[ 2], selector); + w[45] = __byte_perm (w[ 2], w[ 1], selector); + w[44] = __byte_perm (w[ 1], w[ 0], selector); + w[43] = __byte_perm (w[ 0], 0, selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm (w[19], w[18], selector); + w[62] = __byte_perm (w[18], w[17], selector); + w[61] = __byte_perm (w[17], w[16], selector); + w[60] = __byte_perm (w[16], w[15], selector); + w[59] = __byte_perm (w[15], w[14], selector); + w[58] = __byte_perm (w[14], w[13], selector); + w[57] = __byte_perm (w[13], w[12], selector); + w[56] = __byte_perm (w[12], w[11], selector); + w[55] = __byte_perm (w[11], w[10], selector); + w[54] = __byte_perm (w[10], w[ 9], selector); + w[53] = __byte_perm (w[ 9], w[ 8], selector); + w[52] = __byte_perm (w[ 8], w[ 7], selector); + w[51] = __byte_perm (w[ 7], w[ 6], selector); + w[50] = __byte_perm (w[ 6], w[ 5], selector); + w[49] = __byte_perm (w[ 5], w[ 4], selector); + w[48] = __byte_perm (w[ 4], w[ 3], selector); + w[47] = __byte_perm (w[ 3], w[ 2], selector); + w[46] = __byte_perm (w[ 2], w[ 1], selector); + w[45] = __byte_perm (w[ 1], w[ 0], selector); + w[44] = __byte_perm (w[ 0], 0, selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm (w[18], w[17], selector); + w[62] = __byte_perm (w[17], w[16], selector); + w[61] = __byte_perm (w[16], w[15], selector); + w[60] = __byte_perm (w[15], w[14], selector); + w[59] = __byte_perm (w[14], w[13], selector); + w[58] = __byte_perm (w[13], w[12], selector); + w[57] = __byte_perm (w[12], w[11], selector); + w[56] = __byte_perm (w[11], w[10], selector); + w[55] = __byte_perm (w[10], w[ 9], selector); + w[54] = __byte_perm (w[ 9], w[ 8], selector); + w[53] = __byte_perm (w[ 8], w[ 7], selector); + w[52] = __byte_perm (w[ 7], w[ 6], selector); + w[51] = __byte_perm (w[ 6], w[ 5], selector); + w[50] = __byte_perm (w[ 5], w[ 4], selector); + w[49] = __byte_perm (w[ 4], w[ 3], selector); + w[48] = __byte_perm (w[ 3], w[ 2], selector); + w[47] = __byte_perm (w[ 2], w[ 1], selector); + w[46] = __byte_perm (w[ 1], w[ 0], selector); + w[45] = __byte_perm (w[ 0], 0, selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm (w[17], w[16], selector); + w[62] = __byte_perm (w[16], w[15], selector); + w[61] = __byte_perm (w[15], w[14], selector); + w[60] = __byte_perm (w[14], w[13], selector); + w[59] = __byte_perm (w[13], w[12], selector); + w[58] = __byte_perm (w[12], w[11], selector); + w[57] = __byte_perm (w[11], w[10], selector); + w[56] = __byte_perm (w[10], w[ 9], selector); + w[55] = __byte_perm (w[ 9], w[ 8], selector); + w[54] = __byte_perm (w[ 8], w[ 7], selector); + w[53] = __byte_perm (w[ 7], w[ 6], selector); + w[52] = __byte_perm (w[ 6], w[ 5], selector); + w[51] = __byte_perm (w[ 5], w[ 4], selector); + w[50] = __byte_perm (w[ 4], w[ 3], selector); + w[49] = __byte_perm (w[ 3], w[ 2], selector); + w[48] = __byte_perm (w[ 2], w[ 1], selector); + w[47] = __byte_perm (w[ 1], w[ 0], selector); + w[46] = __byte_perm (w[ 0], 0, selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm (w[16], w[15], selector); + w[62] = __byte_perm (w[15], w[14], selector); + w[61] = __byte_perm (w[14], w[13], selector); + w[60] = __byte_perm (w[13], w[12], selector); + w[59] = __byte_perm (w[12], w[11], selector); + w[58] = __byte_perm (w[11], w[10], selector); + w[57] = __byte_perm (w[10], w[ 9], selector); + w[56] = __byte_perm (w[ 9], w[ 8], selector); + w[55] = __byte_perm (w[ 8], w[ 7], selector); + w[54] = __byte_perm (w[ 7], w[ 6], selector); + w[53] = __byte_perm (w[ 6], w[ 5], selector); + w[52] = __byte_perm (w[ 5], w[ 4], selector); + w[51] = __byte_perm (w[ 4], w[ 3], selector); + w[50] = __byte_perm (w[ 3], w[ 2], selector); + w[49] = __byte_perm (w[ 2], w[ 1], selector); + w[48] = __byte_perm (w[ 1], w[ 0], selector); + w[47] = __byte_perm (w[ 0], 0, selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = __byte_perm (w[15], w[14], selector); + w[62] = __byte_perm (w[14], w[13], selector); + w[61] = __byte_perm (w[13], w[12], selector); + w[60] = __byte_perm (w[12], w[11], selector); + w[59] = __byte_perm (w[11], w[10], selector); + w[58] = __byte_perm (w[10], w[ 9], selector); + w[57] = __byte_perm (w[ 9], w[ 8], selector); + w[56] = __byte_perm (w[ 8], w[ 7], selector); + w[55] = __byte_perm (w[ 7], w[ 6], selector); + w[54] = __byte_perm (w[ 6], w[ 5], selector); + w[53] = __byte_perm (w[ 5], w[ 4], selector); + w[52] = __byte_perm (w[ 4], w[ 3], selector); + w[51] = __byte_perm (w[ 3], w[ 2], selector); + w[50] = __byte_perm (w[ 2], w[ 1], selector); + w[49] = __byte_perm (w[ 1], w[ 0], selector); + w[48] = __byte_perm (w[ 0], 0, selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm (w[14], w[13], selector); + w[62] = __byte_perm (w[13], w[12], selector); + w[61] = __byte_perm (w[12], w[11], selector); + w[60] = __byte_perm (w[11], w[10], selector); + w[59] = __byte_perm (w[10], w[ 9], selector); + w[58] = __byte_perm (w[ 9], w[ 8], selector); + w[57] = __byte_perm (w[ 8], w[ 7], selector); + w[56] = __byte_perm (w[ 7], w[ 6], selector); + w[55] = __byte_perm (w[ 6], w[ 5], selector); + w[54] = __byte_perm (w[ 5], w[ 4], selector); + w[53] = __byte_perm (w[ 4], w[ 3], selector); + w[52] = __byte_perm (w[ 3], w[ 2], selector); + w[51] = __byte_perm (w[ 2], w[ 1], selector); + w[50] = __byte_perm (w[ 1], w[ 0], selector); + w[49] = __byte_perm (w[ 0], 0, selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm (w[13], w[12], selector); + w[62] = __byte_perm (w[12], w[11], selector); + w[61] = __byte_perm (w[11], w[10], selector); + w[60] = __byte_perm (w[10], w[ 9], selector); + w[59] = __byte_perm (w[ 9], w[ 8], selector); + w[58] = __byte_perm (w[ 8], w[ 7], selector); + w[57] = __byte_perm (w[ 7], w[ 6], selector); + w[56] = __byte_perm (w[ 6], w[ 5], selector); + w[55] = __byte_perm (w[ 5], w[ 4], selector); + w[54] = __byte_perm (w[ 4], w[ 3], selector); + w[53] = __byte_perm (w[ 3], w[ 2], selector); + w[52] = __byte_perm (w[ 2], w[ 1], selector); + w[51] = __byte_perm (w[ 1], w[ 0], selector); + w[50] = __byte_perm (w[ 0], 0, selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm (w[12], w[11], selector); + w[62] = __byte_perm (w[11], w[10], selector); + w[61] = __byte_perm (w[10], w[ 9], selector); + w[60] = __byte_perm (w[ 9], w[ 8], selector); + w[59] = __byte_perm (w[ 8], w[ 7], selector); + w[58] = __byte_perm (w[ 7], w[ 6], selector); + w[57] = __byte_perm (w[ 6], w[ 5], selector); + w[56] = __byte_perm (w[ 5], w[ 4], selector); + w[55] = __byte_perm (w[ 4], w[ 3], selector); + w[54] = __byte_perm (w[ 3], w[ 2], selector); + w[53] = __byte_perm (w[ 2], w[ 1], selector); + w[52] = __byte_perm (w[ 1], w[ 0], selector); + w[51] = __byte_perm (w[ 0], 0, selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm (w[11], w[10], selector); + w[62] = __byte_perm (w[10], w[ 9], selector); + w[61] = __byte_perm (w[ 9], w[ 8], selector); + w[60] = __byte_perm (w[ 8], w[ 7], selector); + w[59] = __byte_perm (w[ 7], w[ 6], selector); + w[58] = __byte_perm (w[ 6], w[ 5], selector); + w[57] = __byte_perm (w[ 5], w[ 4], selector); + w[56] = __byte_perm (w[ 4], w[ 3], selector); + w[55] = __byte_perm (w[ 3], w[ 2], selector); + w[54] = __byte_perm (w[ 2], w[ 1], selector); + w[53] = __byte_perm (w[ 1], w[ 0], selector); + w[52] = __byte_perm (w[ 0], 0, selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm (w[10], w[ 9], selector); + w[62] = __byte_perm (w[ 9], w[ 8], selector); + w[61] = __byte_perm (w[ 8], w[ 7], selector); + w[60] = __byte_perm (w[ 7], w[ 6], selector); + w[59] = __byte_perm (w[ 6], w[ 5], selector); + w[58] = __byte_perm (w[ 5], w[ 4], selector); + w[57] = __byte_perm (w[ 4], w[ 3], selector); + w[56] = __byte_perm (w[ 3], w[ 2], selector); + w[55] = __byte_perm (w[ 2], w[ 1], selector); + w[54] = __byte_perm (w[ 1], w[ 0], selector); + w[53] = __byte_perm (w[ 0], 0, selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm (w[ 9], w[ 8], selector); + w[62] = __byte_perm (w[ 8], w[ 7], selector); + w[61] = __byte_perm (w[ 7], w[ 6], selector); + w[60] = __byte_perm (w[ 6], w[ 5], selector); + w[59] = __byte_perm (w[ 5], w[ 4], selector); + w[58] = __byte_perm (w[ 4], w[ 3], selector); + w[57] = __byte_perm (w[ 3], w[ 2], selector); + w[56] = __byte_perm (w[ 2], w[ 1], selector); + w[55] = __byte_perm (w[ 1], w[ 0], selector); + w[54] = __byte_perm (w[ 0], 0, selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm (w[ 8], w[ 7], selector); + w[62] = __byte_perm (w[ 7], w[ 6], selector); + w[61] = __byte_perm (w[ 6], w[ 5], selector); + w[60] = __byte_perm (w[ 5], w[ 4], selector); + w[59] = __byte_perm (w[ 4], w[ 3], selector); + w[58] = __byte_perm (w[ 3], w[ 2], selector); + w[57] = __byte_perm (w[ 2], w[ 1], selector); + w[56] = __byte_perm (w[ 1], w[ 0], selector); + w[55] = __byte_perm (w[ 0], 0, selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm (w[ 7], w[ 6], selector); + w[62] = __byte_perm (w[ 6], w[ 5], selector); + w[61] = __byte_perm (w[ 5], w[ 4], selector); + w[60] = __byte_perm (w[ 4], w[ 3], selector); + w[59] = __byte_perm (w[ 3], w[ 2], selector); + w[58] = __byte_perm (w[ 2], w[ 1], selector); + w[57] = __byte_perm (w[ 1], w[ 0], selector); + w[56] = __byte_perm (w[ 0], 0, selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm (w[ 6], w[ 5], selector); + w[62] = __byte_perm (w[ 5], w[ 4], selector); + w[61] = __byte_perm (w[ 4], w[ 3], selector); + w[60] = __byte_perm (w[ 3], w[ 2], selector); + w[59] = __byte_perm (w[ 2], w[ 1], selector); + w[58] = __byte_perm (w[ 1], w[ 0], selector); + w[57] = __byte_perm (w[ 0], 0, selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm (w[ 5], w[ 4], selector); + w[62] = __byte_perm (w[ 4], w[ 3], selector); + w[61] = __byte_perm (w[ 3], w[ 2], selector); + w[60] = __byte_perm (w[ 2], w[ 1], selector); + w[59] = __byte_perm (w[ 1], w[ 0], selector); + w[58] = __byte_perm (w[ 0], 0, selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm (w[ 4], w[ 3], selector); + w[62] = __byte_perm (w[ 3], w[ 2], selector); + w[61] = __byte_perm (w[ 2], w[ 1], selector); + w[60] = __byte_perm (w[ 1], w[ 0], selector); + w[59] = __byte_perm (w[ 0], 0, selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm (w[ 3], w[ 2], selector); + w[62] = __byte_perm (w[ 2], w[ 1], selector); + w[61] = __byte_perm (w[ 1], w[ 0], selector); + w[60] = __byte_perm (w[ 0], 0, selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm (w[ 2], w[ 1], selector); + w[62] = __byte_perm (w[ 1], w[ 0], selector); + w[61] = __byte_perm (w[ 0], 0, selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = __byte_perm (w[ 1], w[ 0], selector); + w[62] = __byte_perm (w[ 0], 0, selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm (w[ 0], 0, selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif +} + void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) { #if defined cl_amd_media_ops @@ -15045,244 +33029,156 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + switch (offset / 4) { case 0: - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 2: - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -15290,32 +33186,18 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -15324,30 +33206,17 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 8: - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -15357,28 +33226,16 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -15389,26 +33246,15 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -15420,24 +33266,14 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -15450,22 +33286,13 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -15479,20 +33306,12 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -15507,18 +33326,11 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -15534,16 +33346,10 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -15560,13 +33366,25 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); #endif #ifdef IS_NV @@ -15903,6 +33721,519 @@ void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + + switch (offset / 4) + { + case 0: + c0[0] = amd_bytealign_S (w3[3], 0, offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = amd_bytealign_S (w3[3], 0, offset); + c0[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = amd_bytealign_S (w3[3], 0, offset); + c0[1] = amd_bytealign_S (w3[2], w3[3], offset); + c0[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = amd_bytealign_S (w3[3], 0, offset); + c0[2] = amd_bytealign_S (w3[2], w3[3], offset); + c0[1] = amd_bytealign_S (w3[1], w3[2], offset); + c0[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = amd_bytealign_S (w3[3], 0, offset); + c0[3] = amd_bytealign_S (w3[2], w3[3], offset); + c0[2] = amd_bytealign_S (w3[1], w3[2], offset); + c0[1] = amd_bytealign_S (w3[0], w3[1], offset); + c0[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = amd_bytealign_S (w3[3], 0, offset); + c1[0] = amd_bytealign_S (w3[2], w3[3], offset); + c0[3] = amd_bytealign_S (w3[1], w3[2], offset); + c0[2] = amd_bytealign_S (w3[0], w3[1], offset); + c0[1] = amd_bytealign_S (w2[3], w3[0], offset); + c0[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = amd_bytealign_S (w3[3], 0, offset); + c1[1] = amd_bytealign_S (w3[2], w3[3], offset); + c1[0] = amd_bytealign_S (w3[1], w3[2], offset); + c0[3] = amd_bytealign_S (w3[0], w3[1], offset); + c0[2] = amd_bytealign_S (w2[3], w3[0], offset); + c0[1] = amd_bytealign_S (w2[2], w2[3], offset); + c0[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = amd_bytealign_S (w3[3], 0, offset); + c1[2] = amd_bytealign_S (w3[2], w3[3], offset); + c1[1] = amd_bytealign_S (w3[1], w3[2], offset); + c1[0] = amd_bytealign_S (w3[0], w3[1], offset); + c0[3] = amd_bytealign_S (w2[3], w3[0], offset); + c0[2] = amd_bytealign_S (w2[2], w2[3], offset); + c0[1] = amd_bytealign_S (w2[1], w2[2], offset); + c0[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = amd_bytealign_S (w3[3], 0, offset); + c1[3] = amd_bytealign_S (w3[2], w3[3], offset); + c1[2] = amd_bytealign_S (w3[1], w3[2], offset); + c1[1] = amd_bytealign_S (w3[0], w3[1], offset); + c1[0] = amd_bytealign_S (w2[3], w3[0], offset); + c0[3] = amd_bytealign_S (w2[2], w2[3], offset); + c0[2] = amd_bytealign_S (w2[1], w2[2], offset); + c0[1] = amd_bytealign_S (w2[0], w2[1], offset); + c0[0] = amd_bytealign_S (w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = amd_bytealign_S (w3[3], 0, offset); + c2[0] = amd_bytealign_S (w3[2], w3[3], offset); + c1[3] = amd_bytealign_S (w3[1], w3[2], offset); + c1[2] = amd_bytealign_S (w3[0], w3[1], offset); + c1[1] = amd_bytealign_S (w2[3], w3[0], offset); + c1[0] = amd_bytealign_S (w2[2], w2[3], offset); + c0[3] = amd_bytealign_S (w2[1], w2[2], offset); + c0[2] = amd_bytealign_S (w2[0], w2[1], offset); + c0[1] = amd_bytealign_S (w1[3], w2[0], offset); + c0[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = amd_bytealign_S (w3[3], 0, offset); + c2[1] = amd_bytealign_S (w3[2], w3[3], offset); + c2[0] = amd_bytealign_S (w3[1], w3[2], offset); + c1[3] = amd_bytealign_S (w3[0], w3[1], offset); + c1[2] = amd_bytealign_S (w2[3], w3[0], offset); + c1[1] = amd_bytealign_S (w2[2], w2[3], offset); + c1[0] = amd_bytealign_S (w2[1], w2[2], offset); + c0[3] = amd_bytealign_S (w2[0], w2[1], offset); + c0[2] = amd_bytealign_S (w1[3], w2[0], offset); + c0[1] = amd_bytealign_S (w1[2], w1[3], offset); + c0[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = amd_bytealign_S (w3[3], 0, offset); + c2[2] = amd_bytealign_S (w3[2], w3[3], offset); + c2[1] = amd_bytealign_S (w3[1], w3[2], offset); + c2[0] = amd_bytealign_S (w3[0], w3[1], offset); + c1[3] = amd_bytealign_S (w2[3], w3[0], offset); + c1[2] = amd_bytealign_S (w2[2], w2[3], offset); + c1[1] = amd_bytealign_S (w2[1], w2[2], offset); + c1[0] = amd_bytealign_S (w2[0], w2[1], offset); + c0[3] = amd_bytealign_S (w1[3], w2[0], offset); + c0[2] = amd_bytealign_S (w1[2], w1[3], offset); + c0[1] = amd_bytealign_S (w1[1], w1[2], offset); + c0[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = amd_bytealign_S (w3[3], 0, offset); + c2[3] = amd_bytealign_S (w3[2], w3[3], offset); + c2[2] = amd_bytealign_S (w3[1], w3[2], offset); + c2[1] = amd_bytealign_S (w3[0], w3[1], offset); + c2[0] = amd_bytealign_S (w2[3], w3[0], offset); + c1[3] = amd_bytealign_S (w2[2], w2[3], offset); + c1[2] = amd_bytealign_S (w2[1], w2[2], offset); + c1[1] = amd_bytealign_S (w2[0], w2[1], offset); + c1[0] = amd_bytealign_S (w1[3], w2[0], offset); + c0[3] = amd_bytealign_S (w1[2], w1[3], offset); + c0[2] = amd_bytealign_S (w1[1], w1[2], offset); + c0[1] = amd_bytealign_S (w1[0], w1[1], offset); + c0[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = amd_bytealign_S (w3[3], 0, offset); + c3[0] = amd_bytealign_S (w3[2], w3[3], offset); + c2[3] = amd_bytealign_S (w3[1], w3[2], offset); + c2[2] = amd_bytealign_S (w3[0], w3[1], offset); + c2[1] = amd_bytealign_S (w2[3], w3[0], offset); + c2[0] = amd_bytealign_S (w2[2], w2[3], offset); + c1[3] = amd_bytealign_S (w2[1], w2[2], offset); + c1[2] = amd_bytealign_S (w2[0], w2[1], offset); + c1[1] = amd_bytealign_S (w1[3], w2[0], offset); + c1[0] = amd_bytealign_S (w1[2], w1[3], offset); + c0[3] = amd_bytealign_S (w1[1], w1[2], offset); + c0[2] = amd_bytealign_S (w1[0], w1[1], offset); + c0[1] = amd_bytealign_S (w0[3], w1[0], offset); + c0[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = amd_bytealign_S (w3[3], 0, offset); + c3[1] = amd_bytealign_S (w3[2], w3[3], offset); + c3[0] = amd_bytealign_S (w3[1], w3[2], offset); + c2[3] = amd_bytealign_S (w3[0], w3[1], offset); + c2[2] = amd_bytealign_S (w2[3], w3[0], offset); + c2[1] = amd_bytealign_S (w2[2], w2[3], offset); + c2[0] = amd_bytealign_S (w2[1], w2[2], offset); + c1[3] = amd_bytealign_S (w2[0], w2[1], offset); + c1[2] = amd_bytealign_S (w1[3], w2[0], offset); + c1[1] = amd_bytealign_S (w1[2], w1[3], offset); + c1[0] = amd_bytealign_S (w1[1], w1[2], offset); + c0[3] = amd_bytealign_S (w1[0], w1[1], offset); + c0[2] = amd_bytealign_S (w0[3], w1[0], offset); + c0[1] = amd_bytealign_S (w0[2], w0[3], offset); + c0[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = amd_bytealign_S (w3[3], 0, offset); + c3[2] = amd_bytealign_S (w3[2], w3[3], offset); + c3[1] = amd_bytealign_S (w3[1], w3[2], offset); + c3[0] = amd_bytealign_S (w3[0], w3[1], offset); + c2[3] = amd_bytealign_S (w2[3], w3[0], offset); + c2[2] = amd_bytealign_S (w2[2], w2[3], offset); + c2[1] = amd_bytealign_S (w2[1], w2[2], offset); + c2[0] = amd_bytealign_S (w2[0], w2[1], offset); + c1[3] = amd_bytealign_S (w1[3], w2[0], offset); + c1[2] = amd_bytealign_S (w1[2], w1[3], offset); + c1[1] = amd_bytealign_S (w1[1], w1[2], offset); + c1[0] = amd_bytealign_S (w1[0], w1[1], offset); + c0[3] = amd_bytealign_S (w0[3], w1[0], offset); + c0[2] = amd_bytealign_S (w0[2], w0[3], offset); + c0[1] = amd_bytealign_S (w0[1], w0[2], offset); + c0[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + c0[0] = swap32_S (c0[0]); + c0[1] = swap32_S (c0[1]); + c0[2] = swap32_S (c0[2]); + c0[3] = swap32_S (c0[3]); + c1[0] = swap32_S (c1[0]); + c1[1] = swap32_S (c1[1]); + c1[2] = swap32_S (c1[2]); + c1[3] = swap32_S (c1[3]); + c2[0] = swap32_S (c2[0]); + c2[1] = swap32_S (c2[1]); + c2[2] = swap32_S (c2[2]); + c2[3] = swap32_S (c2[3]); + c3[0] = swap32_S (c3[0]); + c3[1] = swap32_S (c3[1]); + c3[2] = swap32_S (c3[2]); + c3[3] = swap32_S (c3[3]); + #endif + + #ifdef IS_NV + // todo switch (offset / 4) { case 0: @@ -16697,6 +35028,7 @@ void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 break; } + #endif } void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) @@ -18288,452 +36620,284 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + w4[0] = swap32_S (w4[0]); + w4[1] = swap32_S (w4[1]); + w4[2] = swap32_S (w4[2]); + w4[3] = swap32_S (w4[3]); + w5[0] = swap32_S (w5[0]); + w5[1] = swap32_S (w5[1]); + w5[2] = swap32_S (w5[2]); + w5[3] = swap32_S (w5[3]); + w6[0] = swap32_S (w6[0]); + w6[1] = swap32_S (w6[1]); + w6[2] = swap32_S (w6[2]); + w6[3] = swap32_S (w6[3]); + w7[0] = swap32_S (w7[0]); + w7[1] = swap32_S (w7[1]); + w7[2] = swap32_S (w7[2]); + w7[3] = swap32_S (w7[3]); + switch (offset / 4) { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + case 0: + w7[3] = amd_bytealign_S (w7[2], w7[3], offset); + w7[2] = amd_bytealign_S (w7[1], w7[2], offset); + w7[1] = amd_bytealign_S (w7[0], w7[1], offset); + w7[0] = amd_bytealign_S (w6[3], w7[0], offset); + w6[3] = amd_bytealign_S (w6[2], w6[3], offset); + w6[2] = amd_bytealign_S (w6[1], w6[2], offset); + w6[1] = amd_bytealign_S (w6[0], w6[1], offset); + w6[0] = amd_bytealign_S (w5[3], w6[0], offset); + w5[3] = amd_bytealign_S (w5[2], w5[3], offset); + w5[2] = amd_bytealign_S (w5[1], w5[2], offset); + w5[1] = amd_bytealign_S (w5[0], w5[1], offset); + w5[0] = amd_bytealign_S (w4[3], w5[0], offset); + w4[3] = amd_bytealign_S (w4[2], w4[3], offset); + w4[2] = amd_bytealign_S (w4[1], w4[2], offset); + w4[1] = amd_bytealign_S (w4[0], w4[1], offset); + w4[0] = amd_bytealign_S (w3[3], w4[0], offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 1: + w7[3] = amd_bytealign_S (w7[1], w7[2], offset); + w7[2] = amd_bytealign_S (w7[0], w7[1], offset); + w7[1] = amd_bytealign_S (w6[3], w7[0], offset); + w7[0] = amd_bytealign_S (w6[2], w6[3], offset); + w6[3] = amd_bytealign_S (w6[1], w6[2], offset); + w6[2] = amd_bytealign_S (w6[0], w6[1], offset); + w6[1] = amd_bytealign_S (w5[3], w6[0], offset); + w6[0] = amd_bytealign_S (w5[2], w5[3], offset); + w5[3] = amd_bytealign_S (w5[1], w5[2], offset); + w5[2] = amd_bytealign_S (w5[0], w5[1], offset); + w5[1] = amd_bytealign_S (w4[3], w5[0], offset); + w5[0] = amd_bytealign_S (w4[2], w4[3], offset); + w4[3] = amd_bytealign_S (w4[1], w4[2], offset); + w4[2] = amd_bytealign_S (w4[0], w4[1], offset); + w4[1] = amd_bytealign_S (w3[3], w4[0], offset); + w4[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 2: + w7[3] = amd_bytealign_S (w7[0], w7[1], offset); + w7[2] = amd_bytealign_S (w6[3], w7[0], offset); + w7[1] = amd_bytealign_S (w6[2], w6[3], offset); + w7[0] = amd_bytealign_S (w6[1], w6[2], offset); + w6[3] = amd_bytealign_S (w6[0], w6[1], offset); + w6[2] = amd_bytealign_S (w5[3], w6[0], offset); + w6[1] = amd_bytealign_S (w5[2], w5[3], offset); + w6[0] = amd_bytealign_S (w5[1], w5[2], offset); + w5[3] = amd_bytealign_S (w5[0], w5[1], offset); + w5[2] = amd_bytealign_S (w4[3], w5[0], offset); + w5[1] = amd_bytealign_S (w4[2], w4[3], offset); + w5[0] = amd_bytealign_S (w4[1], w4[2], offset); + w4[3] = amd_bytealign_S (w4[0], w4[1], offset); + w4[2] = amd_bytealign_S (w3[3], w4[0], offset); + w4[1] = amd_bytealign_S (w3[2], w3[3], offset); + w4[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 3: + w7[3] = amd_bytealign_S (w6[3], w7[0], offset); + w7[2] = amd_bytealign_S (w6[2], w6[3], offset); + w7[1] = amd_bytealign_S (w6[1], w6[2], offset); + w7[0] = amd_bytealign_S (w6[0], w6[1], offset); + w6[3] = amd_bytealign_S (w5[3], w6[0], offset); + w6[2] = amd_bytealign_S (w5[2], w5[3], offset); + w6[1] = amd_bytealign_S (w5[1], w5[2], offset); + w6[0] = amd_bytealign_S (w5[0], w5[1], offset); + w5[3] = amd_bytealign_S (w4[3], w5[0], offset); + w5[2] = amd_bytealign_S (w4[2], w4[3], offset); + w5[1] = amd_bytealign_S (w4[1], w4[2], offset); + w5[0] = amd_bytealign_S (w4[0], w4[1], offset); + w4[3] = amd_bytealign_S (w3[3], w4[0], offset); + w4[2] = amd_bytealign_S (w3[2], w3[3], offset); + w4[1] = amd_bytealign_S (w3[1], w3[2], offset); + w4[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 4: + w7[3] = amd_bytealign_S (w6[2], w6[3], offset); + w7[2] = amd_bytealign_S (w6[1], w6[2], offset); + w7[1] = amd_bytealign_S (w6[0], w6[1], offset); + w7[0] = amd_bytealign_S (w5[3], w6[0], offset); + w6[3] = amd_bytealign_S (w5[2], w5[3], offset); + w6[2] = amd_bytealign_S (w5[1], w5[2], offset); + w6[1] = amd_bytealign_S (w5[0], w5[1], offset); + w6[0] = amd_bytealign_S (w4[3], w5[0], offset); + w5[3] = amd_bytealign_S (w4[2], w4[3], offset); + w5[2] = amd_bytealign_S (w4[1], w4[2], offset); + w5[1] = amd_bytealign_S (w4[0], w4[1], offset); + w5[0] = amd_bytealign_S (w3[3], w4[0], offset); + w4[3] = amd_bytealign_S (w3[2], w3[3], offset); + w4[2] = amd_bytealign_S (w3[1], w3[2], offset); + w4[1] = amd_bytealign_S (w3[0], w3[1], offset); + w4[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 5: + w7[3] = amd_bytealign_S (w6[1], w6[2], offset); + w7[2] = amd_bytealign_S (w6[0], w6[1], offset); + w7[1] = amd_bytealign_S (w5[3], w6[0], offset); + w7[0] = amd_bytealign_S (w5[2], w5[3], offset); + w6[3] = amd_bytealign_S (w5[1], w5[2], offset); + w6[2] = amd_bytealign_S (w5[0], w5[1], offset); + w6[1] = amd_bytealign_S (w4[3], w5[0], offset); + w6[0] = amd_bytealign_S (w4[2], w4[3], offset); + w5[3] = amd_bytealign_S (w4[1], w4[2], offset); + w5[2] = amd_bytealign_S (w4[0], w4[1], offset); + w5[1] = amd_bytealign_S (w3[3], w4[0], offset); + w5[0] = amd_bytealign_S (w3[2], w3[3], offset); + w4[3] = amd_bytealign_S (w3[1], w3[2], offset); + w4[2] = amd_bytealign_S (w3[0], w3[1], offset); + w4[1] = amd_bytealign_S (w2[3], w3[0], offset); + w4[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 6: + w7[3] = amd_bytealign_S (w6[0], w6[1], offset); + w7[2] = amd_bytealign_S (w5[3], w6[0], offset); + w7[1] = amd_bytealign_S (w5[2], w5[3], offset); + w7[0] = amd_bytealign_S (w5[1], w5[2], offset); + w6[3] = amd_bytealign_S (w5[0], w5[1], offset); + w6[2] = amd_bytealign_S (w4[3], w5[0], offset); + w6[1] = amd_bytealign_S (w4[2], w4[3], offset); + w6[0] = amd_bytealign_S (w4[1], w4[2], offset); + w5[3] = amd_bytealign_S (w4[0], w4[1], offset); + w5[2] = amd_bytealign_S (w3[3], w4[0], offset); + w5[1] = amd_bytealign_S (w3[2], w3[3], offset); + w5[0] = amd_bytealign_S (w3[1], w3[2], offset); + w4[3] = amd_bytealign_S (w3[0], w3[1], offset); + w4[2] = amd_bytealign_S (w2[3], w3[0], offset); + w4[1] = amd_bytealign_S (w2[2], w2[3], offset); + w4[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -18741,64 +36905,34 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 7: + w7[3] = amd_bytealign_S (w5[3], w6[0], offset); + w7[2] = amd_bytealign_S (w5[2], w5[3], offset); + w7[1] = amd_bytealign_S (w5[1], w5[2], offset); + w7[0] = amd_bytealign_S (w5[0], w5[1], offset); + w6[3] = amd_bytealign_S (w4[3], w5[0], offset); + w6[2] = amd_bytealign_S (w4[2], w4[3], offset); + w6[1] = amd_bytealign_S (w4[1], w4[2], offset); + w6[0] = amd_bytealign_S (w4[0], w4[1], offset); + w5[3] = amd_bytealign_S (w3[3], w4[0], offset); + w5[2] = amd_bytealign_S (w3[2], w3[3], offset); + w5[1] = amd_bytealign_S (w3[1], w3[2], offset); + w5[0] = amd_bytealign_S (w3[0], w3[1], offset); + w4[3] = amd_bytealign_S (w2[3], w3[0], offset); + w4[2] = amd_bytealign_S (w2[2], w2[3], offset); + w4[1] = amd_bytealign_S (w2[1], w2[2], offset); + w4[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -18807,62 +36941,33 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 8: + w7[3] = amd_bytealign_S (w5[2], w5[3], offset); + w7[2] = amd_bytealign_S (w5[1], w5[2], offset); + w7[1] = amd_bytealign_S (w5[0], w5[1], offset); + w7[0] = amd_bytealign_S (w4[3], w5[0], offset); + w6[3] = amd_bytealign_S (w4[2], w4[3], offset); + w6[2] = amd_bytealign_S (w4[1], w4[2], offset); + w6[1] = amd_bytealign_S (w4[0], w4[1], offset); + w6[0] = amd_bytealign_S (w3[3], w4[0], offset); + w5[3] = amd_bytealign_S (w3[2], w3[3], offset); + w5[2] = amd_bytealign_S (w3[1], w3[2], offset); + w5[1] = amd_bytealign_S (w3[0], w3[1], offset); + w5[0] = amd_bytealign_S (w2[3], w3[0], offset); + w4[3] = amd_bytealign_S (w2[2], w2[3], offset); + w4[2] = amd_bytealign_S (w2[1], w2[2], offset); + w4[1] = amd_bytealign_S (w2[0], w2[1], offset); + w4[0] = amd_bytealign_S (w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -18872,60 +36977,32 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 9: + w7[3] = amd_bytealign_S (w5[1], w5[2], offset); + w7[2] = amd_bytealign_S (w5[0], w5[1], offset); + w7[1] = amd_bytealign_S (w4[3], w5[0], offset); + w7[0] = amd_bytealign_S (w4[2], w4[3], offset); + w6[3] = amd_bytealign_S (w4[1], w4[2], offset); + w6[2] = amd_bytealign_S (w4[0], w4[1], offset); + w6[1] = amd_bytealign_S (w3[3], w4[0], offset); + w6[0] = amd_bytealign_S (w3[2], w3[3], offset); + w5[3] = amd_bytealign_S (w3[1], w3[2], offset); + w5[2] = amd_bytealign_S (w3[0], w3[1], offset); + w5[1] = amd_bytealign_S (w2[3], w3[0], offset); + w5[0] = amd_bytealign_S (w2[2], w2[3], offset); + w4[3] = amd_bytealign_S (w2[1], w2[2], offset); + w4[2] = amd_bytealign_S (w2[0], w2[1], offset); + w4[1] = amd_bytealign_S (w1[3], w2[0], offset); + w4[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -18936,58 +37013,31 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w5[0], w5[1], offset); + w7[2] = amd_bytealign_S (w4[3], w5[0], offset); + w7[1] = amd_bytealign_S (w4[2], w4[3], offset); + w7[0] = amd_bytealign_S (w4[1], w4[2], offset); + w6[3] = amd_bytealign_S (w4[0], w4[1], offset); + w6[2] = amd_bytealign_S (w3[3], w4[0], offset); + w6[1] = amd_bytealign_S (w3[2], w3[3], offset); + w6[0] = amd_bytealign_S (w3[1], w3[2], offset); + w5[3] = amd_bytealign_S (w3[0], w3[1], offset); + w5[2] = amd_bytealign_S (w2[3], w3[0], offset); + w5[1] = amd_bytealign_S (w2[2], w2[3], offset); + w5[0] = amd_bytealign_S (w2[1], w2[2], offset); + w4[3] = amd_bytealign_S (w2[0], w2[1], offset); + w4[2] = amd_bytealign_S (w1[3], w2[0], offset); + w4[1] = amd_bytealign_S (w1[2], w1[3], offset); + w4[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -18999,56 +37049,30 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[3], w5[0], offset); + w7[2] = amd_bytealign_S (w4[2], w4[3], offset); + w7[1] = amd_bytealign_S (w4[1], w4[2], offset); + w7[0] = amd_bytealign_S (w4[0], w4[1], offset); + w6[3] = amd_bytealign_S (w3[3], w4[0], offset); + w6[2] = amd_bytealign_S (w3[2], w3[3], offset); + w6[1] = amd_bytealign_S (w3[1], w3[2], offset); + w6[0] = amd_bytealign_S (w3[0], w3[1], offset); + w5[3] = amd_bytealign_S (w2[3], w3[0], offset); + w5[2] = amd_bytealign_S (w2[2], w2[3], offset); + w5[1] = amd_bytealign_S (w2[1], w2[2], offset); + w5[0] = amd_bytealign_S (w2[0], w2[1], offset); + w4[3] = amd_bytealign_S (w1[3], w2[0], offset); + w4[2] = amd_bytealign_S (w1[2], w1[3], offset); + w4[1] = amd_bytealign_S (w1[1], w1[2], offset); + w4[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -19061,54 +37085,29 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[2], w4[3], offset); + w7[2] = amd_bytealign_S (w4[1], w4[2], offset); + w7[1] = amd_bytealign_S (w4[0], w4[1], offset); + w7[0] = amd_bytealign_S (w3[3], w4[0], offset); + w6[3] = amd_bytealign_S (w3[2], w3[3], offset); + w6[2] = amd_bytealign_S (w3[1], w3[2], offset); + w6[1] = amd_bytealign_S (w3[0], w3[1], offset); + w6[0] = amd_bytealign_S (w2[3], w3[0], offset); + w5[3] = amd_bytealign_S (w2[2], w2[3], offset); + w5[2] = amd_bytealign_S (w2[1], w2[2], offset); + w5[1] = amd_bytealign_S (w2[0], w2[1], offset); + w5[0] = amd_bytealign_S (w1[3], w2[0], offset); + w4[3] = amd_bytealign_S (w1[2], w1[3], offset); + w4[2] = amd_bytealign_S (w1[1], w1[2], offset); + w4[1] = amd_bytealign_S (w1[0], w1[1], offset); + w4[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -19122,52 +37121,28 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[1], w4[2], offset); + w7[2] = amd_bytealign_S (w4[0], w4[1], offset); + w7[1] = amd_bytealign_S (w3[3], w4[0], offset); + w7[0] = amd_bytealign_S (w3[2], w3[3], offset); + w6[3] = amd_bytealign_S (w3[1], w3[2], offset); + w6[2] = amd_bytealign_S (w3[0], w3[1], offset); + w6[1] = amd_bytealign_S (w2[3], w3[0], offset); + w6[0] = amd_bytealign_S (w2[2], w2[3], offset); + w5[3] = amd_bytealign_S (w2[1], w2[2], offset); + w5[2] = amd_bytealign_S (w2[0], w2[1], offset); + w5[1] = amd_bytealign_S (w1[3], w2[0], offset); + w5[0] = amd_bytealign_S (w1[2], w1[3], offset); + w4[3] = amd_bytealign_S (w1[1], w1[2], offset); + w4[2] = amd_bytealign_S (w1[0], w1[1], offset); + w4[1] = amd_bytealign_S (w0[3], w1[0], offset); + w4[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -19182,50 +37157,27 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[0], w4[1], offset); + w7[2] = amd_bytealign_S (w3[3], w4[0], offset); + w7[1] = amd_bytealign_S (w3[2], w3[3], offset); + w7[0] = amd_bytealign_S (w3[1], w3[2], offset); + w6[3] = amd_bytealign_S (w3[0], w3[1], offset); + w6[2] = amd_bytealign_S (w2[3], w3[0], offset); + w6[1] = amd_bytealign_S (w2[2], w2[3], offset); + w6[0] = amd_bytealign_S (w2[1], w2[2], offset); + w5[3] = amd_bytealign_S (w2[0], w2[1], offset); + w5[2] = amd_bytealign_S (w1[3], w2[0], offset); + w5[1] = amd_bytealign_S (w1[2], w1[3], offset); + w5[0] = amd_bytealign_S (w1[1], w1[2], offset); + w4[3] = amd_bytealign_S (w1[0], w1[1], offset); + w4[2] = amd_bytealign_S (w0[3], w1[0], offset); + w4[1] = amd_bytealign_S (w0[2], w0[3], offset); + w4[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -19241,48 +37193,26 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w3[3], w4[0], offset); + w7[2] = amd_bytealign_S (w3[2], w3[3], offset); + w7[1] = amd_bytealign_S (w3[1], w3[2], offset); + w7[0] = amd_bytealign_S (w3[0], w3[1], offset); + w6[3] = amd_bytealign_S (w2[3], w3[0], offset); + w6[2] = amd_bytealign_S (w2[2], w2[3], offset); + w6[1] = amd_bytealign_S (w2[1], w2[2], offset); + w6[0] = amd_bytealign_S (w2[0], w2[1], offset); + w5[3] = amd_bytealign_S (w1[3], w2[0], offset); + w5[2] = amd_bytealign_S (w1[2], w1[3], offset); + w5[1] = amd_bytealign_S (w1[1], w1[2], offset); + w5[0] = amd_bytealign_S (w1[0], w1[1], offset); + w4[3] = amd_bytealign_S (w0[3], w1[0], offset); + w4[2] = amd_bytealign_S (w0[2], w0[3], offset); + w4[1] = amd_bytealign_S (w0[1], w0[2], offset); + w4[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -19299,29 +37229,617 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 16: + w7[3] = amd_bytealign_S (w3[2], w3[3], offset); + w7[2] = amd_bytealign_S (w3[1], w3[2], offset); + w7[1] = amd_bytealign_S (w3[0], w3[1], offset); + w7[0] = amd_bytealign_S (w2[3], w3[0], offset); + w6[3] = amd_bytealign_S (w2[2], w2[3], offset); + w6[2] = amd_bytealign_S (w2[1], w2[2], offset); + w6[1] = amd_bytealign_S (w2[0], w2[1], offset); + w6[0] = amd_bytealign_S (w1[3], w2[0], offset); + w5[3] = amd_bytealign_S (w1[2], w1[3], offset); + w5[2] = amd_bytealign_S (w1[1], w1[2], offset); + w5[1] = amd_bytealign_S (w1[0], w1[1], offset); + w5[0] = amd_bytealign_S (w0[3], w1[0], offset); + w4[3] = amd_bytealign_S (w0[2], w0[3], offset); + w4[2] = amd_bytealign_S (w0[1], w0[2], offset); + w4[1] = amd_bytealign_S (w0[0], w0[1], offset); + w4[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign_S (w3[1], w3[2], offset); + w7[2] = amd_bytealign_S (w3[0], w3[1], offset); + w7[1] = amd_bytealign_S (w2[3], w3[0], offset); + w7[0] = amd_bytealign_S (w2[2], w2[3], offset); + w6[3] = amd_bytealign_S (w2[1], w2[2], offset); + w6[2] = amd_bytealign_S (w2[0], w2[1], offset); + w6[1] = amd_bytealign_S (w1[3], w2[0], offset); + w6[0] = amd_bytealign_S (w1[2], w1[3], offset); + w5[3] = amd_bytealign_S (w1[1], w1[2], offset); + w5[2] = amd_bytealign_S (w1[0], w1[1], offset); + w5[1] = amd_bytealign_S (w0[3], w1[0], offset); + w5[0] = amd_bytealign_S (w0[2], w0[3], offset); + w4[3] = amd_bytealign_S (w0[1], w0[2], offset); + w4[2] = amd_bytealign_S (w0[0], w0[1], offset); + w4[1] = amd_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + w7[3] = amd_bytealign_S (w3[0], w3[1], offset); + w7[2] = amd_bytealign_S (w2[3], w3[0], offset); + w7[1] = amd_bytealign_S (w2[2], w2[3], offset); + w7[0] = amd_bytealign_S (w2[1], w2[2], offset); + w6[3] = amd_bytealign_S (w2[0], w2[1], offset); + w6[2] = amd_bytealign_S (w1[3], w2[0], offset); + w6[1] = amd_bytealign_S (w1[2], w1[3], offset); + w6[0] = amd_bytealign_S (w1[1], w1[2], offset); + w5[3] = amd_bytealign_S (w1[0], w1[1], offset); + w5[2] = amd_bytealign_S (w0[3], w1[0], offset); + w5[1] = amd_bytealign_S (w0[2], w0[3], offset); + w5[0] = amd_bytealign_S (w0[1], w0[2], offset); + w4[3] = amd_bytealign_S (w0[0], w0[1], offset); + w4[2] = amd_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + w7[3] = amd_bytealign_S (w2[3], w3[0], offset); + w7[2] = amd_bytealign_S (w2[2], w2[3], offset); + w7[1] = amd_bytealign_S (w2[1], w2[2], offset); + w7[0] = amd_bytealign_S (w2[0], w2[1], offset); + w6[3] = amd_bytealign_S (w1[3], w2[0], offset); + w6[2] = amd_bytealign_S (w1[2], w1[3], offset); + w6[1] = amd_bytealign_S (w1[1], w1[2], offset); + w6[0] = amd_bytealign_S (w1[0], w1[1], offset); + w5[3] = amd_bytealign_S (w0[3], w1[0], offset); + w5[2] = amd_bytealign_S (w0[2], w0[3], offset); + w5[1] = amd_bytealign_S (w0[1], w0[2], offset); + w5[0] = amd_bytealign_S (w0[0], w0[1], offset); + w4[3] = amd_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = amd_bytealign_S (w2[2], w2[3], offset); + w7[2] = amd_bytealign_S (w2[1], w2[2], offset); + w7[1] = amd_bytealign_S (w2[0], w2[1], offset); + w7[0] = amd_bytealign_S (w1[3], w2[0], offset); + w6[3] = amd_bytealign_S (w1[2], w1[3], offset); + w6[2] = amd_bytealign_S (w1[1], w1[2], offset); + w6[1] = amd_bytealign_S (w1[0], w1[1], offset); + w6[0] = amd_bytealign_S (w0[3], w1[0], offset); + w5[3] = amd_bytealign_S (w0[2], w0[3], offset); + w5[2] = amd_bytealign_S (w0[1], w0[2], offset); + w5[1] = amd_bytealign_S (w0[0], w0[1], offset); + w5[0] = amd_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + w7[3] = amd_bytealign_S (w2[1], w2[2], offset); + w7[2] = amd_bytealign_S (w2[0], w2[1], offset); + w7[1] = amd_bytealign_S (w1[3], w2[0], offset); + w7[0] = amd_bytealign_S (w1[2], w1[3], offset); + w6[3] = amd_bytealign_S (w1[1], w1[2], offset); + w6[2] = amd_bytealign_S (w1[0], w1[1], offset); + w6[1] = amd_bytealign_S (w0[3], w1[0], offset); + w6[0] = amd_bytealign_S (w0[2], w0[3], offset); + w5[3] = amd_bytealign_S (w0[1], w0[2], offset); + w5[2] = amd_bytealign_S (w0[0], w0[1], offset); + w5[1] = amd_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = amd_bytealign_S (w2[0], w2[1], offset); + w7[2] = amd_bytealign_S (w1[3], w2[0], offset); + w7[1] = amd_bytealign_S (w1[2], w1[3], offset); + w7[0] = amd_bytealign_S (w1[1], w1[2], offset); + w6[3] = amd_bytealign_S (w1[0], w1[1], offset); + w6[2] = amd_bytealign_S (w0[3], w1[0], offset); + w6[1] = amd_bytealign_S (w0[2], w0[3], offset); + w6[0] = amd_bytealign_S (w0[1], w0[2], offset); + w5[3] = amd_bytealign_S (w0[0], w0[1], offset); + w5[2] = amd_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = amd_bytealign_S (w1[3], w2[0], offset); + w7[2] = amd_bytealign_S (w1[2], w1[3], offset); + w7[1] = amd_bytealign_S (w1[1], w1[2], offset); + w7[0] = amd_bytealign_S (w1[0], w1[1], offset); + w6[3] = amd_bytealign_S (w0[3], w1[0], offset); + w6[2] = amd_bytealign_S (w0[2], w0[3], offset); + w6[1] = amd_bytealign_S (w0[1], w0[2], offset); + w6[0] = amd_bytealign_S (w0[0], w0[1], offset); + w5[3] = amd_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = amd_bytealign_S (w1[2], w1[3], offset); + w7[2] = amd_bytealign_S (w1[1], w1[2], offset); + w7[1] = amd_bytealign_S (w1[0], w1[1], offset); + w7[0] = amd_bytealign_S (w0[3], w1[0], offset); + w6[3] = amd_bytealign_S (w0[2], w0[3], offset); + w6[2] = amd_bytealign_S (w0[1], w0[2], offset); + w6[1] = amd_bytealign_S (w0[0], w0[1], offset); + w6[0] = amd_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] = amd_bytealign_S (w1[1], w1[2], offset); + w7[2] = amd_bytealign_S (w1[0], w1[1], offset); + w7[1] = amd_bytealign_S (w0[3], w1[0], offset); + w7[0] = amd_bytealign_S (w0[2], w0[3], offset); + w6[3] = amd_bytealign_S (w0[1], w0[2], offset); + w6[2] = amd_bytealign_S (w0[0], w0[1], offset); + w6[1] = amd_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = amd_bytealign_S (w1[0], w1[1], offset); + w7[2] = amd_bytealign_S (w0[3], w1[0], offset); + w7[1] = amd_bytealign_S (w0[2], w0[3], offset); + w7[0] = amd_bytealign_S (w0[1], w0[2], offset); + w6[3] = amd_bytealign_S (w0[0], w0[1], offset); + w6[2] = amd_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign_S (w0[3], w1[0], offset); + w7[2] = amd_bytealign_S (w0[2], w0[3], offset); + w7[1] = amd_bytealign_S (w0[1], w0[2], offset); + w7[0] = amd_bytealign_S (w0[0], w0[1], offset); + w6[3] = amd_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = amd_bytealign_S (w0[2], w0[3], offset); + w7[2] = amd_bytealign_S (w0[1], w0[2], offset); + w7[1] = amd_bytealign_S (w0[0], w0[1], offset); + w7[0] = amd_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = amd_bytealign_S (w0[1], w0[2], offset); + w7[2] = amd_bytealign_S (w0[0], w0[1], offset); + w7[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = amd_bytealign_S (w0[0], w0[1], offset); + w7[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + w4[0] = swap32_S (w4[0]); + w4[1] = swap32_S (w4[1]); + w4[2] = swap32_S (w4[2]); + w4[3] = swap32_S (w4[3]); + w5[0] = swap32_S (w5[0]); + w5[1] = swap32_S (w5[1]); + w5[2] = swap32_S (w5[2]); + w5[3] = swap32_S (w5[3]); + w6[0] = swap32_S (w6[0]); + w6[1] = swap32_S (w6[1]); + w6[2] = swap32_S (w6[2]); + w6[3] = swap32_S (w6[3]); + w7[0] = swap32_S (w7[0]); + w7[1] = swap32_S (w7[1]); + w7[2] = swap32_S (w7[2]); + w7[3] = swap32_S (w7[3]); #endif #ifdef IS_NV @@ -25593,868 +44111,479 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); + switch (offset / 4) { case 0: - w[63] = amd_bytealign_S (w[63], w[62], offset_minus_4); - w[62] = amd_bytealign_S (w[62], w[61], offset_minus_4); - w[61] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[60] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[59] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[58] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[57] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[56] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[55] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[54] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[53] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[52] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[51] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[50] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[49] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[48] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[47] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[46] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[45] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[44] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[43] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[42] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[41] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[40] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[39] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[38] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[37] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[36] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[35] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[34] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[33] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[32] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[31] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[30] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[29] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[28] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[27] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[26] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[25] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[24] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[23] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[22] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[21] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[20] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[19] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[18] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[17] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[16] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[15] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[14] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[13] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[12] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[11] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[10] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 1] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 0] = amd_bytealign_S (w[ 0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w[ 0] = w[ 1]; - w[ 1] = w[ 2]; - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } + w[63] = amd_bytealign_S (w[62], w[63], offset); + w[62] = amd_bytealign_S (w[61], w[62], offset); + w[61] = amd_bytealign_S (w[60], w[61], offset); + w[60] = amd_bytealign_S (w[59], w[60], offset); + w[59] = amd_bytealign_S (w[58], w[59], offset); + w[58] = amd_bytealign_S (w[57], w[58], offset); + w[57] = amd_bytealign_S (w[56], w[57], offset); + w[56] = amd_bytealign_S (w[55], w[56], offset); + w[55] = amd_bytealign_S (w[54], w[55], offset); + w[54] = amd_bytealign_S (w[53], w[54], offset); + w[53] = amd_bytealign_S (w[52], w[53], offset); + w[52] = amd_bytealign_S (w[51], w[52], offset); + w[51] = amd_bytealign_S (w[50], w[51], offset); + w[50] = amd_bytealign_S (w[49], w[50], offset); + w[49] = amd_bytealign_S (w[48], w[49], offset); + w[48] = amd_bytealign_S (w[47], w[48], offset); + w[47] = amd_bytealign_S (w[46], w[47], offset); + w[46] = amd_bytealign_S (w[45], w[46], offset); + w[45] = amd_bytealign_S (w[44], w[45], offset); + w[44] = amd_bytealign_S (w[43], w[44], offset); + w[43] = amd_bytealign_S (w[42], w[43], offset); + w[42] = amd_bytealign_S (w[41], w[42], offset); + w[41] = amd_bytealign_S (w[40], w[41], offset); + w[40] = amd_bytealign_S (w[39], w[40], offset); + w[39] = amd_bytealign_S (w[38], w[39], offset); + w[38] = amd_bytealign_S (w[37], w[38], offset); + w[37] = amd_bytealign_S (w[36], w[37], offset); + w[36] = amd_bytealign_S (w[35], w[36], offset); + w[35] = amd_bytealign_S (w[34], w[35], offset); + w[34] = amd_bytealign_S (w[33], w[34], offset); + w[33] = amd_bytealign_S (w[32], w[33], offset); + w[32] = amd_bytealign_S (w[31], w[32], offset); + w[31] = amd_bytealign_S (w[30], w[31], offset); + w[30] = amd_bytealign_S (w[29], w[30], offset); + w[29] = amd_bytealign_S (w[28], w[29], offset); + w[28] = amd_bytealign_S (w[27], w[28], offset); + w[27] = amd_bytealign_S (w[26], w[27], offset); + w[26] = amd_bytealign_S (w[25], w[26], offset); + w[25] = amd_bytealign_S (w[24], w[25], offset); + w[24] = amd_bytealign_S (w[23], w[24], offset); + w[23] = amd_bytealign_S (w[22], w[23], offset); + w[22] = amd_bytealign_S (w[21], w[22], offset); + w[21] = amd_bytealign_S (w[20], w[21], offset); + w[20] = amd_bytealign_S (w[19], w[20], offset); + w[19] = amd_bytealign_S (w[18], w[19], offset); + w[18] = amd_bytealign_S (w[17], w[18], offset); + w[17] = amd_bytealign_S (w[16], w[17], offset); + w[16] = amd_bytealign_S (w[15], w[16], offset); + w[15] = amd_bytealign_S (w[14], w[15], offset); + w[14] = amd_bytealign_S (w[13], w[14], offset); + w[13] = amd_bytealign_S (w[12], w[13], offset); + w[12] = amd_bytealign_S (w[11], w[12], offset); + w[11] = amd_bytealign_S (w[10], w[11], offset); + w[10] = amd_bytealign_S (w[ 9], w[10], offset); + w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign_S (w[62], w[61], offset_minus_4); - w[62] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[61] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[60] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[59] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[58] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[57] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[56] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[55] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[54] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[53] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[52] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[51] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[50] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[49] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[48] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[47] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[46] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[45] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[44] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[43] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[42] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[41] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[40] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[39] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[38] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[37] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[36] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[35] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[34] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[33] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[32] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[31] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[30] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[29] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[28] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[27] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[26] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[25] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[24] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[23] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[22] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[21] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[20] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[19] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[18] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[17] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[16] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[15] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[14] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[13] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[12] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[11] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[10] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 1] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[61], w[62], offset); + w[62] = amd_bytealign_S (w[60], w[61], offset); + w[61] = amd_bytealign_S (w[59], w[60], offset); + w[60] = amd_bytealign_S (w[58], w[59], offset); + w[59] = amd_bytealign_S (w[57], w[58], offset); + w[58] = amd_bytealign_S (w[56], w[57], offset); + w[57] = amd_bytealign_S (w[55], w[56], offset); + w[56] = amd_bytealign_S (w[54], w[55], offset); + w[55] = amd_bytealign_S (w[53], w[54], offset); + w[54] = amd_bytealign_S (w[52], w[53], offset); + w[53] = amd_bytealign_S (w[51], w[52], offset); + w[52] = amd_bytealign_S (w[50], w[51], offset); + w[51] = amd_bytealign_S (w[49], w[50], offset); + w[50] = amd_bytealign_S (w[48], w[49], offset); + w[49] = amd_bytealign_S (w[47], w[48], offset); + w[48] = amd_bytealign_S (w[46], w[47], offset); + w[47] = amd_bytealign_S (w[45], w[46], offset); + w[46] = amd_bytealign_S (w[44], w[45], offset); + w[45] = amd_bytealign_S (w[43], w[44], offset); + w[44] = amd_bytealign_S (w[42], w[43], offset); + w[43] = amd_bytealign_S (w[41], w[42], offset); + w[42] = amd_bytealign_S (w[40], w[41], offset); + w[41] = amd_bytealign_S (w[39], w[40], offset); + w[40] = amd_bytealign_S (w[38], w[39], offset); + w[39] = amd_bytealign_S (w[37], w[38], offset); + w[38] = amd_bytealign_S (w[36], w[37], offset); + w[37] = amd_bytealign_S (w[35], w[36], offset); + w[36] = amd_bytealign_S (w[34], w[35], offset); + w[35] = amd_bytealign_S (w[33], w[34], offset); + w[34] = amd_bytealign_S (w[32], w[33], offset); + w[33] = amd_bytealign_S (w[31], w[32], offset); + w[32] = amd_bytealign_S (w[30], w[31], offset); + w[31] = amd_bytealign_S (w[29], w[30], offset); + w[30] = amd_bytealign_S (w[28], w[29], offset); + w[29] = amd_bytealign_S (w[27], w[28], offset); + w[28] = amd_bytealign_S (w[26], w[27], offset); + w[27] = amd_bytealign_S (w[25], w[26], offset); + w[26] = amd_bytealign_S (w[24], w[25], offset); + w[25] = amd_bytealign_S (w[23], w[24], offset); + w[24] = amd_bytealign_S (w[22], w[23], offset); + w[23] = amd_bytealign_S (w[21], w[22], offset); + w[22] = amd_bytealign_S (w[20], w[21], offset); + w[21] = amd_bytealign_S (w[19], w[20], offset); + w[20] = amd_bytealign_S (w[18], w[19], offset); + w[19] = amd_bytealign_S (w[17], w[18], offset); + w[18] = amd_bytealign_S (w[16], w[17], offset); + w[17] = amd_bytealign_S (w[15], w[16], offset); + w[16] = amd_bytealign_S (w[14], w[15], offset); + w[15] = amd_bytealign_S (w[13], w[14], offset); + w[14] = amd_bytealign_S (w[12], w[13], offset); + w[13] = amd_bytealign_S (w[11], w[12], offset); + w[12] = amd_bytealign_S (w[10], w[11], offset); + w[11] = amd_bytealign_S (w[ 9], w[10], offset); + w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 1] = w[ 2]; - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 2: - w[63] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[62] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[61] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[60] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[59] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[58] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[57] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[56] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[55] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[54] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[53] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[52] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[51] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[50] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[49] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[48] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[47] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[46] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[45] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[44] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[43] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[42] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[41] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[40] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[39] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[38] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[37] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[36] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[35] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[34] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[33] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[32] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[31] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[30] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[29] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[28] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[27] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[26] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[25] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[24] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[23] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[22] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[21] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[20] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[19] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[18] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[17] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[16] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[15] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[14] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[13] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[12] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[11] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[10] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[60], w[61], offset); + w[62] = amd_bytealign_S (w[59], w[60], offset); + w[61] = amd_bytealign_S (w[58], w[59], offset); + w[60] = amd_bytealign_S (w[57], w[58], offset); + w[59] = amd_bytealign_S (w[56], w[57], offset); + w[58] = amd_bytealign_S (w[55], w[56], offset); + w[57] = amd_bytealign_S (w[54], w[55], offset); + w[56] = amd_bytealign_S (w[53], w[54], offset); + w[55] = amd_bytealign_S (w[52], w[53], offset); + w[54] = amd_bytealign_S (w[51], w[52], offset); + w[53] = amd_bytealign_S (w[50], w[51], offset); + w[52] = amd_bytealign_S (w[49], w[50], offset); + w[51] = amd_bytealign_S (w[48], w[49], offset); + w[50] = amd_bytealign_S (w[47], w[48], offset); + w[49] = amd_bytealign_S (w[46], w[47], offset); + w[48] = amd_bytealign_S (w[45], w[46], offset); + w[47] = amd_bytealign_S (w[44], w[45], offset); + w[46] = amd_bytealign_S (w[43], w[44], offset); + w[45] = amd_bytealign_S (w[42], w[43], offset); + w[44] = amd_bytealign_S (w[41], w[42], offset); + w[43] = amd_bytealign_S (w[40], w[41], offset); + w[42] = amd_bytealign_S (w[39], w[40], offset); + w[41] = amd_bytealign_S (w[38], w[39], offset); + w[40] = amd_bytealign_S (w[37], w[38], offset); + w[39] = amd_bytealign_S (w[36], w[37], offset); + w[38] = amd_bytealign_S (w[35], w[36], offset); + w[37] = amd_bytealign_S (w[34], w[35], offset); + w[36] = amd_bytealign_S (w[33], w[34], offset); + w[35] = amd_bytealign_S (w[32], w[33], offset); + w[34] = amd_bytealign_S (w[31], w[32], offset); + w[33] = amd_bytealign_S (w[30], w[31], offset); + w[32] = amd_bytealign_S (w[29], w[30], offset); + w[31] = amd_bytealign_S (w[28], w[29], offset); + w[30] = amd_bytealign_S (w[27], w[28], offset); + w[29] = amd_bytealign_S (w[26], w[27], offset); + w[28] = amd_bytealign_S (w[25], w[26], offset); + w[27] = amd_bytealign_S (w[24], w[25], offset); + w[26] = amd_bytealign_S (w[23], w[24], offset); + w[25] = amd_bytealign_S (w[22], w[23], offset); + w[24] = amd_bytealign_S (w[21], w[22], offset); + w[23] = amd_bytealign_S (w[20], w[21], offset); + w[22] = amd_bytealign_S (w[19], w[20], offset); + w[21] = amd_bytealign_S (w[18], w[19], offset); + w[20] = amd_bytealign_S (w[17], w[18], offset); + w[19] = amd_bytealign_S (w[16], w[17], offset); + w[18] = amd_bytealign_S (w[15], w[16], offset); + w[17] = amd_bytealign_S (w[14], w[15], offset); + w[16] = amd_bytealign_S (w[13], w[14], offset); + w[15] = amd_bytealign_S (w[12], w[13], offset); + w[14] = amd_bytealign_S (w[11], w[12], offset); + w[13] = amd_bytealign_S (w[10], w[11], offset); + w[12] = amd_bytealign_S (w[ 9], w[10], offset); + w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 3: - w[63] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[62] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[61] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[60] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[59] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[58] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[57] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[56] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[55] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[54] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[53] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[52] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[51] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[50] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[49] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[48] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[47] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[46] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[45] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[44] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[43] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[42] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[41] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[40] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[39] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[38] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[37] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[36] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[35] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[34] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[33] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[32] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[31] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[30] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[29] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[28] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[27] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[26] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[25] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[24] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[23] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[22] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[21] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[20] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[19] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[18] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[17] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[16] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[15] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[14] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[13] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[12] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[11] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[10] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[59], w[60], offset); + w[62] = amd_bytealign_S (w[58], w[59], offset); + w[61] = amd_bytealign_S (w[57], w[58], offset); + w[60] = amd_bytealign_S (w[56], w[57], offset); + w[59] = amd_bytealign_S (w[55], w[56], offset); + w[58] = amd_bytealign_S (w[54], w[55], offset); + w[57] = amd_bytealign_S (w[53], w[54], offset); + w[56] = amd_bytealign_S (w[52], w[53], offset); + w[55] = amd_bytealign_S (w[51], w[52], offset); + w[54] = amd_bytealign_S (w[50], w[51], offset); + w[53] = amd_bytealign_S (w[49], w[50], offset); + w[52] = amd_bytealign_S (w[48], w[49], offset); + w[51] = amd_bytealign_S (w[47], w[48], offset); + w[50] = amd_bytealign_S (w[46], w[47], offset); + w[49] = amd_bytealign_S (w[45], w[46], offset); + w[48] = amd_bytealign_S (w[44], w[45], offset); + w[47] = amd_bytealign_S (w[43], w[44], offset); + w[46] = amd_bytealign_S (w[42], w[43], offset); + w[45] = amd_bytealign_S (w[41], w[42], offset); + w[44] = amd_bytealign_S (w[40], w[41], offset); + w[43] = amd_bytealign_S (w[39], w[40], offset); + w[42] = amd_bytealign_S (w[38], w[39], offset); + w[41] = amd_bytealign_S (w[37], w[38], offset); + w[40] = amd_bytealign_S (w[36], w[37], offset); + w[39] = amd_bytealign_S (w[35], w[36], offset); + w[38] = amd_bytealign_S (w[34], w[35], offset); + w[37] = amd_bytealign_S (w[33], w[34], offset); + w[36] = amd_bytealign_S (w[32], w[33], offset); + w[35] = amd_bytealign_S (w[31], w[32], offset); + w[34] = amd_bytealign_S (w[30], w[31], offset); + w[33] = amd_bytealign_S (w[29], w[30], offset); + w[32] = amd_bytealign_S (w[28], w[29], offset); + w[31] = amd_bytealign_S (w[27], w[28], offset); + w[30] = amd_bytealign_S (w[26], w[27], offset); + w[29] = amd_bytealign_S (w[25], w[26], offset); + w[28] = amd_bytealign_S (w[24], w[25], offset); + w[27] = amd_bytealign_S (w[23], w[24], offset); + w[26] = amd_bytealign_S (w[22], w[23], offset); + w[25] = amd_bytealign_S (w[21], w[22], offset); + w[24] = amd_bytealign_S (w[20], w[21], offset); + w[23] = amd_bytealign_S (w[19], w[20], offset); + w[22] = amd_bytealign_S (w[18], w[19], offset); + w[21] = amd_bytealign_S (w[17], w[18], offset); + w[20] = amd_bytealign_S (w[16], w[17], offset); + w[19] = amd_bytealign_S (w[15], w[16], offset); + w[18] = amd_bytealign_S (w[14], w[15], offset); + w[17] = amd_bytealign_S (w[13], w[14], offset); + w[16] = amd_bytealign_S (w[12], w[13], offset); + w[15] = amd_bytealign_S (w[11], w[12], offset); + w[14] = amd_bytealign_S (w[10], w[11], offset); + w[13] = amd_bytealign_S (w[ 9], w[10], offset); + w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 4: - w[63] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[62] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[61] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[60] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[59] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[58] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[57] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[56] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[55] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[54] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[53] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[52] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[51] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[50] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[49] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[48] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[47] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[46] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[45] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[44] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[43] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[42] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[41] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[40] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[39] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[38] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[37] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[36] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[35] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[34] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[33] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[32] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[31] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[30] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[29] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[28] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[27] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[26] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[25] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[24] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[23] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[22] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[21] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[20] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[19] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[18] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[17] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[16] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[15] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[14] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[13] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[12] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[11] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[10] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[58], w[59], offset); + w[62] = amd_bytealign_S (w[57], w[58], offset); + w[61] = amd_bytealign_S (w[56], w[57], offset); + w[60] = amd_bytealign_S (w[55], w[56], offset); + w[59] = amd_bytealign_S (w[54], w[55], offset); + w[58] = amd_bytealign_S (w[53], w[54], offset); + w[57] = amd_bytealign_S (w[52], w[53], offset); + w[56] = amd_bytealign_S (w[51], w[52], offset); + w[55] = amd_bytealign_S (w[50], w[51], offset); + w[54] = amd_bytealign_S (w[49], w[50], offset); + w[53] = amd_bytealign_S (w[48], w[49], offset); + w[52] = amd_bytealign_S (w[47], w[48], offset); + w[51] = amd_bytealign_S (w[46], w[47], offset); + w[50] = amd_bytealign_S (w[45], w[46], offset); + w[49] = amd_bytealign_S (w[44], w[45], offset); + w[48] = amd_bytealign_S (w[43], w[44], offset); + w[47] = amd_bytealign_S (w[42], w[43], offset); + w[46] = amd_bytealign_S (w[41], w[42], offset); + w[45] = amd_bytealign_S (w[40], w[41], offset); + w[44] = amd_bytealign_S (w[39], w[40], offset); + w[43] = amd_bytealign_S (w[38], w[39], offset); + w[42] = amd_bytealign_S (w[37], w[38], offset); + w[41] = amd_bytealign_S (w[36], w[37], offset); + w[40] = amd_bytealign_S (w[35], w[36], offset); + w[39] = amd_bytealign_S (w[34], w[35], offset); + w[38] = amd_bytealign_S (w[33], w[34], offset); + w[37] = amd_bytealign_S (w[32], w[33], offset); + w[36] = amd_bytealign_S (w[31], w[32], offset); + w[35] = amd_bytealign_S (w[30], w[31], offset); + w[34] = amd_bytealign_S (w[29], w[30], offset); + w[33] = amd_bytealign_S (w[28], w[29], offset); + w[32] = amd_bytealign_S (w[27], w[28], offset); + w[31] = amd_bytealign_S (w[26], w[27], offset); + w[30] = amd_bytealign_S (w[25], w[26], offset); + w[29] = amd_bytealign_S (w[24], w[25], offset); + w[28] = amd_bytealign_S (w[23], w[24], offset); + w[27] = amd_bytealign_S (w[22], w[23], offset); + w[26] = amd_bytealign_S (w[21], w[22], offset); + w[25] = amd_bytealign_S (w[20], w[21], offset); + w[24] = amd_bytealign_S (w[19], w[20], offset); + w[23] = amd_bytealign_S (w[18], w[19], offset); + w[22] = amd_bytealign_S (w[17], w[18], offset); + w[21] = amd_bytealign_S (w[16], w[17], offset); + w[20] = amd_bytealign_S (w[15], w[16], offset); + w[19] = amd_bytealign_S (w[14], w[15], offset); + w[18] = amd_bytealign_S (w[13], w[14], offset); + w[17] = amd_bytealign_S (w[12], w[13], offset); + w[16] = amd_bytealign_S (w[11], w[12], offset); + w[15] = amd_bytealign_S (w[10], w[11], offset); + w[14] = amd_bytealign_S (w[ 9], w[10], offset); + w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 5: - w[63] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[62] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[61] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[60] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[59] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[58] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[57] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[56] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[55] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[54] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[53] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[52] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[51] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[50] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[49] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[48] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[47] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[46] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[45] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[44] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[43] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[42] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[41] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[40] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[39] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[38] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[37] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[36] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[35] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[34] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[33] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[32] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[31] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[30] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[29] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[28] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[27] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[26] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[25] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[24] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[23] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[22] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[21] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[20] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[19] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[18] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[17] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[16] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[15] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[14] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[13] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[12] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[11] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[10] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[57], w[58], offset); + w[62] = amd_bytealign_S (w[56], w[57], offset); + w[61] = amd_bytealign_S (w[55], w[56], offset); + w[60] = amd_bytealign_S (w[54], w[55], offset); + w[59] = amd_bytealign_S (w[53], w[54], offset); + w[58] = amd_bytealign_S (w[52], w[53], offset); + w[57] = amd_bytealign_S (w[51], w[52], offset); + w[56] = amd_bytealign_S (w[50], w[51], offset); + w[55] = amd_bytealign_S (w[49], w[50], offset); + w[54] = amd_bytealign_S (w[48], w[49], offset); + w[53] = amd_bytealign_S (w[47], w[48], offset); + w[52] = amd_bytealign_S (w[46], w[47], offset); + w[51] = amd_bytealign_S (w[45], w[46], offset); + w[50] = amd_bytealign_S (w[44], w[45], offset); + w[49] = amd_bytealign_S (w[43], w[44], offset); + w[48] = amd_bytealign_S (w[42], w[43], offset); + w[47] = amd_bytealign_S (w[41], w[42], offset); + w[46] = amd_bytealign_S (w[40], w[41], offset); + w[45] = amd_bytealign_S (w[39], w[40], offset); + w[44] = amd_bytealign_S (w[38], w[39], offset); + w[43] = amd_bytealign_S (w[37], w[38], offset); + w[42] = amd_bytealign_S (w[36], w[37], offset); + w[41] = amd_bytealign_S (w[35], w[36], offset); + w[40] = amd_bytealign_S (w[34], w[35], offset); + w[39] = amd_bytealign_S (w[33], w[34], offset); + w[38] = amd_bytealign_S (w[32], w[33], offset); + w[37] = amd_bytealign_S (w[31], w[32], offset); + w[36] = amd_bytealign_S (w[30], w[31], offset); + w[35] = amd_bytealign_S (w[29], w[30], offset); + w[34] = amd_bytealign_S (w[28], w[29], offset); + w[33] = amd_bytealign_S (w[27], w[28], offset); + w[32] = amd_bytealign_S (w[26], w[27], offset); + w[31] = amd_bytealign_S (w[25], w[26], offset); + w[30] = amd_bytealign_S (w[24], w[25], offset); + w[29] = amd_bytealign_S (w[23], w[24], offset); + w[28] = amd_bytealign_S (w[22], w[23], offset); + w[27] = amd_bytealign_S (w[21], w[22], offset); + w[26] = amd_bytealign_S (w[20], w[21], offset); + w[25] = amd_bytealign_S (w[19], w[20], offset); + w[24] = amd_bytealign_S (w[18], w[19], offset); + w[23] = amd_bytealign_S (w[17], w[18], offset); + w[22] = amd_bytealign_S (w[16], w[17], offset); + w[21] = amd_bytealign_S (w[15], w[16], offset); + w[20] = amd_bytealign_S (w[14], w[15], offset); + w[19] = amd_bytealign_S (w[13], w[14], offset); + w[18] = amd_bytealign_S (w[12], w[13], offset); + w[17] = amd_bytealign_S (w[11], w[12], offset); + w[16] = amd_bytealign_S (w[10], w[11], offset); + w[15] = amd_bytealign_S (w[ 9], w[10], offset); + w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 6: - w[63] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[62] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[61] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[60] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[59] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[58] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[57] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[56] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[55] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[54] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[53] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[52] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[51] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[50] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[49] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[48] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[47] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[46] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[45] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[44] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[43] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[42] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[41] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[40] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[39] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[38] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[37] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[36] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[35] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[34] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[33] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[32] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[31] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[30] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[29] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[28] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[27] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[26] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[25] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[24] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[23] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[22] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[21] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[20] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[19] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[18] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[17] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[16] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[15] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[14] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[13] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[12] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[11] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[10] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[56], w[57], offset); + w[62] = amd_bytealign_S (w[55], w[56], offset); + w[61] = amd_bytealign_S (w[54], w[55], offset); + w[60] = amd_bytealign_S (w[53], w[54], offset); + w[59] = amd_bytealign_S (w[52], w[53], offset); + w[58] = amd_bytealign_S (w[51], w[52], offset); + w[57] = amd_bytealign_S (w[50], w[51], offset); + w[56] = amd_bytealign_S (w[49], w[50], offset); + w[55] = amd_bytealign_S (w[48], w[49], offset); + w[54] = amd_bytealign_S (w[47], w[48], offset); + w[53] = amd_bytealign_S (w[46], w[47], offset); + w[52] = amd_bytealign_S (w[45], w[46], offset); + w[51] = amd_bytealign_S (w[44], w[45], offset); + w[50] = amd_bytealign_S (w[43], w[44], offset); + w[49] = amd_bytealign_S (w[42], w[43], offset); + w[48] = amd_bytealign_S (w[41], w[42], offset); + w[47] = amd_bytealign_S (w[40], w[41], offset); + w[46] = amd_bytealign_S (w[39], w[40], offset); + w[45] = amd_bytealign_S (w[38], w[39], offset); + w[44] = amd_bytealign_S (w[37], w[38], offset); + w[43] = amd_bytealign_S (w[36], w[37], offset); + w[42] = amd_bytealign_S (w[35], w[36], offset); + w[41] = amd_bytealign_S (w[34], w[35], offset); + w[40] = amd_bytealign_S (w[33], w[34], offset); + w[39] = amd_bytealign_S (w[32], w[33], offset); + w[38] = amd_bytealign_S (w[31], w[32], offset); + w[37] = amd_bytealign_S (w[30], w[31], offset); + w[36] = amd_bytealign_S (w[29], w[30], offset); + w[35] = amd_bytealign_S (w[28], w[29], offset); + w[34] = amd_bytealign_S (w[27], w[28], offset); + w[33] = amd_bytealign_S (w[26], w[27], offset); + w[32] = amd_bytealign_S (w[25], w[26], offset); + w[31] = amd_bytealign_S (w[24], w[25], offset); + w[30] = amd_bytealign_S (w[23], w[24], offset); + w[29] = amd_bytealign_S (w[22], w[23], offset); + w[28] = amd_bytealign_S (w[21], w[22], offset); + w[27] = amd_bytealign_S (w[20], w[21], offset); + w[26] = amd_bytealign_S (w[19], w[20], offset); + w[25] = amd_bytealign_S (w[18], w[19], offset); + w[24] = amd_bytealign_S (w[17], w[18], offset); + w[23] = amd_bytealign_S (w[16], w[17], offset); + w[22] = amd_bytealign_S (w[15], w[16], offset); + w[21] = amd_bytealign_S (w[14], w[15], offset); + w[20] = amd_bytealign_S (w[13], w[14], offset); + w[19] = amd_bytealign_S (w[12], w[13], offset); + w[18] = amd_bytealign_S (w[11], w[12], offset); + w[17] = amd_bytealign_S (w[10], w[11], offset); + w[16] = amd_bytealign_S (w[ 9], w[10], offset); + w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -26462,128 +44591,66 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 7: - w[63] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[62] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[61] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[60] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[59] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[58] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[57] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[56] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[55] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[54] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[53] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[52] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[51] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[50] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[49] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[48] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[47] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[46] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[45] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[44] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[43] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[42] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[41] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[40] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[39] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[38] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[37] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[36] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[35] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[34] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[33] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[32] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[31] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[30] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[29] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[28] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[27] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[26] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[25] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[24] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[23] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[22] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[21] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[20] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[19] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[18] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[17] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[16] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[15] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[14] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[13] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[12] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[11] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[10] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[55], w[56], offset); + w[62] = amd_bytealign_S (w[54], w[55], offset); + w[61] = amd_bytealign_S (w[53], w[54], offset); + w[60] = amd_bytealign_S (w[52], w[53], offset); + w[59] = amd_bytealign_S (w[51], w[52], offset); + w[58] = amd_bytealign_S (w[50], w[51], offset); + w[57] = amd_bytealign_S (w[49], w[50], offset); + w[56] = amd_bytealign_S (w[48], w[49], offset); + w[55] = amd_bytealign_S (w[47], w[48], offset); + w[54] = amd_bytealign_S (w[46], w[47], offset); + w[53] = amd_bytealign_S (w[45], w[46], offset); + w[52] = amd_bytealign_S (w[44], w[45], offset); + w[51] = amd_bytealign_S (w[43], w[44], offset); + w[50] = amd_bytealign_S (w[42], w[43], offset); + w[49] = amd_bytealign_S (w[41], w[42], offset); + w[48] = amd_bytealign_S (w[40], w[41], offset); + w[47] = amd_bytealign_S (w[39], w[40], offset); + w[46] = amd_bytealign_S (w[38], w[39], offset); + w[45] = amd_bytealign_S (w[37], w[38], offset); + w[44] = amd_bytealign_S (w[36], w[37], offset); + w[43] = amd_bytealign_S (w[35], w[36], offset); + w[42] = amd_bytealign_S (w[34], w[35], offset); + w[41] = amd_bytealign_S (w[33], w[34], offset); + w[40] = amd_bytealign_S (w[32], w[33], offset); + w[39] = amd_bytealign_S (w[31], w[32], offset); + w[38] = amd_bytealign_S (w[30], w[31], offset); + w[37] = amd_bytealign_S (w[29], w[30], offset); + w[36] = amd_bytealign_S (w[28], w[29], offset); + w[35] = amd_bytealign_S (w[27], w[28], offset); + w[34] = amd_bytealign_S (w[26], w[27], offset); + w[33] = amd_bytealign_S (w[25], w[26], offset); + w[32] = amd_bytealign_S (w[24], w[25], offset); + w[31] = amd_bytealign_S (w[23], w[24], offset); + w[30] = amd_bytealign_S (w[22], w[23], offset); + w[29] = amd_bytealign_S (w[21], w[22], offset); + w[28] = amd_bytealign_S (w[20], w[21], offset); + w[27] = amd_bytealign_S (w[19], w[20], offset); + w[26] = amd_bytealign_S (w[18], w[19], offset); + w[25] = amd_bytealign_S (w[17], w[18], offset); + w[24] = amd_bytealign_S (w[16], w[17], offset); + w[23] = amd_bytealign_S (w[15], w[16], offset); + w[22] = amd_bytealign_S (w[14], w[15], offset); + w[21] = amd_bytealign_S (w[13], w[14], offset); + w[20] = amd_bytealign_S (w[12], w[13], offset); + w[19] = amd_bytealign_S (w[11], w[12], offset); + w[18] = amd_bytealign_S (w[10], w[11], offset); + w[17] = amd_bytealign_S (w[ 9], w[10], offset); + w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -26592,126 +44659,65 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 8: - w[63] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[62] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[61] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[60] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[59] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[58] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[57] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[56] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[55] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[54] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[53] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[52] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[51] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[50] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[49] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[48] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[47] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[46] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[45] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[44] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[43] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[42] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[41] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[40] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[39] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[38] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[37] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[36] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[35] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[34] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[33] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[32] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[31] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[30] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[29] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[28] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[27] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[26] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[25] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[24] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[23] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[22] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[21] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[20] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[19] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[18] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[17] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[16] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[15] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[14] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[13] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[12] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[11] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[10] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[54], w[55], offset); + w[62] = amd_bytealign_S (w[53], w[54], offset); + w[61] = amd_bytealign_S (w[52], w[53], offset); + w[60] = amd_bytealign_S (w[51], w[52], offset); + w[59] = amd_bytealign_S (w[50], w[51], offset); + w[58] = amd_bytealign_S (w[49], w[50], offset); + w[57] = amd_bytealign_S (w[48], w[49], offset); + w[56] = amd_bytealign_S (w[47], w[48], offset); + w[55] = amd_bytealign_S (w[46], w[47], offset); + w[54] = amd_bytealign_S (w[45], w[46], offset); + w[53] = amd_bytealign_S (w[44], w[45], offset); + w[52] = amd_bytealign_S (w[43], w[44], offset); + w[51] = amd_bytealign_S (w[42], w[43], offset); + w[50] = amd_bytealign_S (w[41], w[42], offset); + w[49] = amd_bytealign_S (w[40], w[41], offset); + w[48] = amd_bytealign_S (w[39], w[40], offset); + w[47] = amd_bytealign_S (w[38], w[39], offset); + w[46] = amd_bytealign_S (w[37], w[38], offset); + w[45] = amd_bytealign_S (w[36], w[37], offset); + w[44] = amd_bytealign_S (w[35], w[36], offset); + w[43] = amd_bytealign_S (w[34], w[35], offset); + w[42] = amd_bytealign_S (w[33], w[34], offset); + w[41] = amd_bytealign_S (w[32], w[33], offset); + w[40] = amd_bytealign_S (w[31], w[32], offset); + w[39] = amd_bytealign_S (w[30], w[31], offset); + w[38] = amd_bytealign_S (w[29], w[30], offset); + w[37] = amd_bytealign_S (w[28], w[29], offset); + w[36] = amd_bytealign_S (w[27], w[28], offset); + w[35] = amd_bytealign_S (w[26], w[27], offset); + w[34] = amd_bytealign_S (w[25], w[26], offset); + w[33] = amd_bytealign_S (w[24], w[25], offset); + w[32] = amd_bytealign_S (w[23], w[24], offset); + w[31] = amd_bytealign_S (w[22], w[23], offset); + w[30] = amd_bytealign_S (w[21], w[22], offset); + w[29] = amd_bytealign_S (w[20], w[21], offset); + w[28] = amd_bytealign_S (w[19], w[20], offset); + w[27] = amd_bytealign_S (w[18], w[19], offset); + w[26] = amd_bytealign_S (w[17], w[18], offset); + w[25] = amd_bytealign_S (w[16], w[17], offset); + w[24] = amd_bytealign_S (w[15], w[16], offset); + w[23] = amd_bytealign_S (w[14], w[15], offset); + w[22] = amd_bytealign_S (w[13], w[14], offset); + w[21] = amd_bytealign_S (w[12], w[13], offset); + w[20] = amd_bytealign_S (w[11], w[12], offset); + w[19] = amd_bytealign_S (w[10], w[11], offset); + w[18] = amd_bytealign_S (w[ 9], w[10], offset); + w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -26721,124 +44727,64 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 9: - w[63] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[62] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[61] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[60] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[59] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[58] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[57] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[56] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[55] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[54] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[53] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[52] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[51] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[50] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[49] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[48] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[47] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[46] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[45] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[44] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[43] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[42] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[41] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[40] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[39] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[38] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[37] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[36] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[35] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[34] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[33] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[32] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[31] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[30] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[29] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[28] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[27] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[26] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[25] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[24] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[23] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[22] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[21] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[20] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[19] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[18] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[17] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[16] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[15] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[14] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[13] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[12] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[11] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[10] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[53], w[54], offset); + w[62] = amd_bytealign_S (w[52], w[53], offset); + w[61] = amd_bytealign_S (w[51], w[52], offset); + w[60] = amd_bytealign_S (w[50], w[51], offset); + w[59] = amd_bytealign_S (w[49], w[50], offset); + w[58] = amd_bytealign_S (w[48], w[49], offset); + w[57] = amd_bytealign_S (w[47], w[48], offset); + w[56] = amd_bytealign_S (w[46], w[47], offset); + w[55] = amd_bytealign_S (w[45], w[46], offset); + w[54] = amd_bytealign_S (w[44], w[45], offset); + w[53] = amd_bytealign_S (w[43], w[44], offset); + w[52] = amd_bytealign_S (w[42], w[43], offset); + w[51] = amd_bytealign_S (w[41], w[42], offset); + w[50] = amd_bytealign_S (w[40], w[41], offset); + w[49] = amd_bytealign_S (w[39], w[40], offset); + w[48] = amd_bytealign_S (w[38], w[39], offset); + w[47] = amd_bytealign_S (w[37], w[38], offset); + w[46] = amd_bytealign_S (w[36], w[37], offset); + w[45] = amd_bytealign_S (w[35], w[36], offset); + w[44] = amd_bytealign_S (w[34], w[35], offset); + w[43] = amd_bytealign_S (w[33], w[34], offset); + w[42] = amd_bytealign_S (w[32], w[33], offset); + w[41] = amd_bytealign_S (w[31], w[32], offset); + w[40] = amd_bytealign_S (w[30], w[31], offset); + w[39] = amd_bytealign_S (w[29], w[30], offset); + w[38] = amd_bytealign_S (w[28], w[29], offset); + w[37] = amd_bytealign_S (w[27], w[28], offset); + w[36] = amd_bytealign_S (w[26], w[27], offset); + w[35] = amd_bytealign_S (w[25], w[26], offset); + w[34] = amd_bytealign_S (w[24], w[25], offset); + w[33] = amd_bytealign_S (w[23], w[24], offset); + w[32] = amd_bytealign_S (w[22], w[23], offset); + w[31] = amd_bytealign_S (w[21], w[22], offset); + w[30] = amd_bytealign_S (w[20], w[21], offset); + w[29] = amd_bytealign_S (w[19], w[20], offset); + w[28] = amd_bytealign_S (w[18], w[19], offset); + w[27] = amd_bytealign_S (w[17], w[18], offset); + w[26] = amd_bytealign_S (w[16], w[17], offset); + w[25] = amd_bytealign_S (w[15], w[16], offset); + w[24] = amd_bytealign_S (w[14], w[15], offset); + w[23] = amd_bytealign_S (w[13], w[14], offset); + w[22] = amd_bytealign_S (w[12], w[13], offset); + w[21] = amd_bytealign_S (w[11], w[12], offset); + w[20] = amd_bytealign_S (w[10], w[11], offset); + w[19] = amd_bytealign_S (w[ 9], w[10], offset); + w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -26849,122 +44795,63 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 10: - w[63] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[62] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[61] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[60] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[59] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[58] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[57] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[56] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[55] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[54] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[53] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[52] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[51] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[50] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[49] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[48] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[47] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[46] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[45] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[44] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[43] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[42] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[41] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[40] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[39] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[38] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[37] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[36] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[35] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[34] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[33] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[32] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[31] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[30] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[29] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[28] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[27] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[26] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[25] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[24] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[23] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[22] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[21] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[20] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[19] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[18] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[17] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[16] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[15] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[14] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[13] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[12] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[11] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[10] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[52], w[53], offset); + w[62] = amd_bytealign_S (w[51], w[52], offset); + w[61] = amd_bytealign_S (w[50], w[51], offset); + w[60] = amd_bytealign_S (w[49], w[50], offset); + w[59] = amd_bytealign_S (w[48], w[49], offset); + w[58] = amd_bytealign_S (w[47], w[48], offset); + w[57] = amd_bytealign_S (w[46], w[47], offset); + w[56] = amd_bytealign_S (w[45], w[46], offset); + w[55] = amd_bytealign_S (w[44], w[45], offset); + w[54] = amd_bytealign_S (w[43], w[44], offset); + w[53] = amd_bytealign_S (w[42], w[43], offset); + w[52] = amd_bytealign_S (w[41], w[42], offset); + w[51] = amd_bytealign_S (w[40], w[41], offset); + w[50] = amd_bytealign_S (w[39], w[40], offset); + w[49] = amd_bytealign_S (w[38], w[39], offset); + w[48] = amd_bytealign_S (w[37], w[38], offset); + w[47] = amd_bytealign_S (w[36], w[37], offset); + w[46] = amd_bytealign_S (w[35], w[36], offset); + w[45] = amd_bytealign_S (w[34], w[35], offset); + w[44] = amd_bytealign_S (w[33], w[34], offset); + w[43] = amd_bytealign_S (w[32], w[33], offset); + w[42] = amd_bytealign_S (w[31], w[32], offset); + w[41] = amd_bytealign_S (w[30], w[31], offset); + w[40] = amd_bytealign_S (w[29], w[30], offset); + w[39] = amd_bytealign_S (w[28], w[29], offset); + w[38] = amd_bytealign_S (w[27], w[28], offset); + w[37] = amd_bytealign_S (w[26], w[27], offset); + w[36] = amd_bytealign_S (w[25], w[26], offset); + w[35] = amd_bytealign_S (w[24], w[25], offset); + w[34] = amd_bytealign_S (w[23], w[24], offset); + w[33] = amd_bytealign_S (w[22], w[23], offset); + w[32] = amd_bytealign_S (w[21], w[22], offset); + w[31] = amd_bytealign_S (w[20], w[21], offset); + w[30] = amd_bytealign_S (w[19], w[20], offset); + w[29] = amd_bytealign_S (w[18], w[19], offset); + w[28] = amd_bytealign_S (w[17], w[18], offset); + w[27] = amd_bytealign_S (w[16], w[17], offset); + w[26] = amd_bytealign_S (w[15], w[16], offset); + w[25] = amd_bytealign_S (w[14], w[15], offset); + w[24] = amd_bytealign_S (w[13], w[14], offset); + w[23] = amd_bytealign_S (w[12], w[13], offset); + w[22] = amd_bytealign_S (w[11], w[12], offset); + w[21] = amd_bytealign_S (w[10], w[11], offset); + w[20] = amd_bytealign_S (w[ 9], w[10], offset); + w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[10] = amd_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -26976,120 +44863,62 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 11: - w[63] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[62] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[61] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[60] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[59] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[58] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[57] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[56] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[55] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[54] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[53] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[52] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[51] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[50] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[49] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[48] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[47] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[46] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[45] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[44] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[43] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[42] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[41] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[40] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[39] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[38] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[37] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[36] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[35] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[34] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[33] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[32] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[31] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[30] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[29] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[28] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[27] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[26] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[25] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[24] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[23] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[22] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[21] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[20] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[19] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[18] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[17] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[16] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[15] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[14] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[13] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[12] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[11] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[51], w[52], offset); + w[62] = amd_bytealign_S (w[50], w[51], offset); + w[61] = amd_bytealign_S (w[49], w[50], offset); + w[60] = amd_bytealign_S (w[48], w[49], offset); + w[59] = amd_bytealign_S (w[47], w[48], offset); + w[58] = amd_bytealign_S (w[46], w[47], offset); + w[57] = amd_bytealign_S (w[45], w[46], offset); + w[56] = amd_bytealign_S (w[44], w[45], offset); + w[55] = amd_bytealign_S (w[43], w[44], offset); + w[54] = amd_bytealign_S (w[42], w[43], offset); + w[53] = amd_bytealign_S (w[41], w[42], offset); + w[52] = amd_bytealign_S (w[40], w[41], offset); + w[51] = amd_bytealign_S (w[39], w[40], offset); + w[50] = amd_bytealign_S (w[38], w[39], offset); + w[49] = amd_bytealign_S (w[37], w[38], offset); + w[48] = amd_bytealign_S (w[36], w[37], offset); + w[47] = amd_bytealign_S (w[35], w[36], offset); + w[46] = amd_bytealign_S (w[34], w[35], offset); + w[45] = amd_bytealign_S (w[33], w[34], offset); + w[44] = amd_bytealign_S (w[32], w[33], offset); + w[43] = amd_bytealign_S (w[31], w[32], offset); + w[42] = amd_bytealign_S (w[30], w[31], offset); + w[41] = amd_bytealign_S (w[29], w[30], offset); + w[40] = amd_bytealign_S (w[28], w[29], offset); + w[39] = amd_bytealign_S (w[27], w[28], offset); + w[38] = amd_bytealign_S (w[26], w[27], offset); + w[37] = amd_bytealign_S (w[25], w[26], offset); + w[36] = amd_bytealign_S (w[24], w[25], offset); + w[35] = amd_bytealign_S (w[23], w[24], offset); + w[34] = amd_bytealign_S (w[22], w[23], offset); + w[33] = amd_bytealign_S (w[21], w[22], offset); + w[32] = amd_bytealign_S (w[20], w[21], offset); + w[31] = amd_bytealign_S (w[19], w[20], offset); + w[30] = amd_bytealign_S (w[18], w[19], offset); + w[29] = amd_bytealign_S (w[17], w[18], offset); + w[28] = amd_bytealign_S (w[16], w[17], offset); + w[27] = amd_bytealign_S (w[15], w[16], offset); + w[26] = amd_bytealign_S (w[14], w[15], offset); + w[25] = amd_bytealign_S (w[13], w[14], offset); + w[24] = amd_bytealign_S (w[12], w[13], offset); + w[23] = amd_bytealign_S (w[11], w[12], offset); + w[22] = amd_bytealign_S (w[10], w[11], offset); + w[21] = amd_bytealign_S (w[ 9], w[10], offset); + w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[11] = amd_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -27102,118 +44931,61 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 12: - w[63] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[62] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[61] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[60] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[59] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[58] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[57] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[56] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[55] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[54] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[53] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[52] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[51] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[50] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[49] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[48] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[47] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[46] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[45] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[44] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[43] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[42] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[41] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[40] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[39] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[38] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[37] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[36] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[35] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[34] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[33] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[32] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[31] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[30] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[29] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[28] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[27] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[26] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[25] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[24] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[23] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[22] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[21] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[20] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[19] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[18] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[17] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[16] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[15] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[14] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[13] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[12] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[50], w[51], offset); + w[62] = amd_bytealign_S (w[49], w[50], offset); + w[61] = amd_bytealign_S (w[48], w[49], offset); + w[60] = amd_bytealign_S (w[47], w[48], offset); + w[59] = amd_bytealign_S (w[46], w[47], offset); + w[58] = amd_bytealign_S (w[45], w[46], offset); + w[57] = amd_bytealign_S (w[44], w[45], offset); + w[56] = amd_bytealign_S (w[43], w[44], offset); + w[55] = amd_bytealign_S (w[42], w[43], offset); + w[54] = amd_bytealign_S (w[41], w[42], offset); + w[53] = amd_bytealign_S (w[40], w[41], offset); + w[52] = amd_bytealign_S (w[39], w[40], offset); + w[51] = amd_bytealign_S (w[38], w[39], offset); + w[50] = amd_bytealign_S (w[37], w[38], offset); + w[49] = amd_bytealign_S (w[36], w[37], offset); + w[48] = amd_bytealign_S (w[35], w[36], offset); + w[47] = amd_bytealign_S (w[34], w[35], offset); + w[46] = amd_bytealign_S (w[33], w[34], offset); + w[45] = amd_bytealign_S (w[32], w[33], offset); + w[44] = amd_bytealign_S (w[31], w[32], offset); + w[43] = amd_bytealign_S (w[30], w[31], offset); + w[42] = amd_bytealign_S (w[29], w[30], offset); + w[41] = amd_bytealign_S (w[28], w[29], offset); + w[40] = amd_bytealign_S (w[27], w[28], offset); + w[39] = amd_bytealign_S (w[26], w[27], offset); + w[38] = amd_bytealign_S (w[25], w[26], offset); + w[37] = amd_bytealign_S (w[24], w[25], offset); + w[36] = amd_bytealign_S (w[23], w[24], offset); + w[35] = amd_bytealign_S (w[22], w[23], offset); + w[34] = amd_bytealign_S (w[21], w[22], offset); + w[33] = amd_bytealign_S (w[20], w[21], offset); + w[32] = amd_bytealign_S (w[19], w[20], offset); + w[31] = amd_bytealign_S (w[18], w[19], offset); + w[30] = amd_bytealign_S (w[17], w[18], offset); + w[29] = amd_bytealign_S (w[16], w[17], offset); + w[28] = amd_bytealign_S (w[15], w[16], offset); + w[27] = amd_bytealign_S (w[14], w[15], offset); + w[26] = amd_bytealign_S (w[13], w[14], offset); + w[25] = amd_bytealign_S (w[12], w[13], offset); + w[24] = amd_bytealign_S (w[11], w[12], offset); + w[23] = amd_bytealign_S (w[10], w[11], offset); + w[22] = amd_bytealign_S (w[ 9], w[10], offset); + w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[12] = amd_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -27227,116 +44999,60 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 13: - w[63] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[62] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[61] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[60] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[59] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[58] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[57] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[56] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[55] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[54] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[53] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[52] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[51] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[50] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[49] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[48] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[47] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[46] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[45] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[44] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[43] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[42] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[41] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[40] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[39] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[38] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[37] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[36] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[35] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[34] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[33] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[32] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[31] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[30] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[29] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[28] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[27] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[26] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[25] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[24] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[23] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[22] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[21] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[20] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[19] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[18] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[17] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[16] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[15] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[14] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[13] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[49], w[50], offset); + w[62] = amd_bytealign_S (w[48], w[49], offset); + w[61] = amd_bytealign_S (w[47], w[48], offset); + w[60] = amd_bytealign_S (w[46], w[47], offset); + w[59] = amd_bytealign_S (w[45], w[46], offset); + w[58] = amd_bytealign_S (w[44], w[45], offset); + w[57] = amd_bytealign_S (w[43], w[44], offset); + w[56] = amd_bytealign_S (w[42], w[43], offset); + w[55] = amd_bytealign_S (w[41], w[42], offset); + w[54] = amd_bytealign_S (w[40], w[41], offset); + w[53] = amd_bytealign_S (w[39], w[40], offset); + w[52] = amd_bytealign_S (w[38], w[39], offset); + w[51] = amd_bytealign_S (w[37], w[38], offset); + w[50] = amd_bytealign_S (w[36], w[37], offset); + w[49] = amd_bytealign_S (w[35], w[36], offset); + w[48] = amd_bytealign_S (w[34], w[35], offset); + w[47] = amd_bytealign_S (w[33], w[34], offset); + w[46] = amd_bytealign_S (w[32], w[33], offset); + w[45] = amd_bytealign_S (w[31], w[32], offset); + w[44] = amd_bytealign_S (w[30], w[31], offset); + w[43] = amd_bytealign_S (w[29], w[30], offset); + w[42] = amd_bytealign_S (w[28], w[29], offset); + w[41] = amd_bytealign_S (w[27], w[28], offset); + w[40] = amd_bytealign_S (w[26], w[27], offset); + w[39] = amd_bytealign_S (w[25], w[26], offset); + w[38] = amd_bytealign_S (w[24], w[25], offset); + w[37] = amd_bytealign_S (w[23], w[24], offset); + w[36] = amd_bytealign_S (w[22], w[23], offset); + w[35] = amd_bytealign_S (w[21], w[22], offset); + w[34] = amd_bytealign_S (w[20], w[21], offset); + w[33] = amd_bytealign_S (w[19], w[20], offset); + w[32] = amd_bytealign_S (w[18], w[19], offset); + w[31] = amd_bytealign_S (w[17], w[18], offset); + w[30] = amd_bytealign_S (w[16], w[17], offset); + w[29] = amd_bytealign_S (w[15], w[16], offset); + w[28] = amd_bytealign_S (w[14], w[15], offset); + w[27] = amd_bytealign_S (w[13], w[14], offset); + w[26] = amd_bytealign_S (w[12], w[13], offset); + w[25] = amd_bytealign_S (w[11], w[12], offset); + w[24] = amd_bytealign_S (w[10], w[11], offset); + w[23] = amd_bytealign_S (w[ 9], w[10], offset); + w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[13] = amd_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -27351,114 +45067,59 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 14: - w[63] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[62] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[61] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[60] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[59] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[58] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[57] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[56] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[55] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[54] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[53] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[52] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[51] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[50] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[49] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[48] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[47] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[46] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[45] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[44] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[43] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[42] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[41] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[40] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[39] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[38] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[37] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[36] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[35] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[34] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[33] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[32] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[31] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[30] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[29] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[28] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[27] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[26] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[25] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[24] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[23] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[22] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[21] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[20] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[19] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[18] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[17] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[16] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[15] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[14] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[48], w[49], offset); + w[62] = amd_bytealign_S (w[47], w[48], offset); + w[61] = amd_bytealign_S (w[46], w[47], offset); + w[60] = amd_bytealign_S (w[45], w[46], offset); + w[59] = amd_bytealign_S (w[44], w[45], offset); + w[58] = amd_bytealign_S (w[43], w[44], offset); + w[57] = amd_bytealign_S (w[42], w[43], offset); + w[56] = amd_bytealign_S (w[41], w[42], offset); + w[55] = amd_bytealign_S (w[40], w[41], offset); + w[54] = amd_bytealign_S (w[39], w[40], offset); + w[53] = amd_bytealign_S (w[38], w[39], offset); + w[52] = amd_bytealign_S (w[37], w[38], offset); + w[51] = amd_bytealign_S (w[36], w[37], offset); + w[50] = amd_bytealign_S (w[35], w[36], offset); + w[49] = amd_bytealign_S (w[34], w[35], offset); + w[48] = amd_bytealign_S (w[33], w[34], offset); + w[47] = amd_bytealign_S (w[32], w[33], offset); + w[46] = amd_bytealign_S (w[31], w[32], offset); + w[45] = amd_bytealign_S (w[30], w[31], offset); + w[44] = amd_bytealign_S (w[29], w[30], offset); + w[43] = amd_bytealign_S (w[28], w[29], offset); + w[42] = amd_bytealign_S (w[27], w[28], offset); + w[41] = amd_bytealign_S (w[26], w[27], offset); + w[40] = amd_bytealign_S (w[25], w[26], offset); + w[39] = amd_bytealign_S (w[24], w[25], offset); + w[38] = amd_bytealign_S (w[23], w[24], offset); + w[37] = amd_bytealign_S (w[22], w[23], offset); + w[36] = amd_bytealign_S (w[21], w[22], offset); + w[35] = amd_bytealign_S (w[20], w[21], offset); + w[34] = amd_bytealign_S (w[19], w[20], offset); + w[33] = amd_bytealign_S (w[18], w[19], offset); + w[32] = amd_bytealign_S (w[17], w[18], offset); + w[31] = amd_bytealign_S (w[16], w[17], offset); + w[30] = amd_bytealign_S (w[15], w[16], offset); + w[29] = amd_bytealign_S (w[14], w[15], offset); + w[28] = amd_bytealign_S (w[13], w[14], offset); + w[27] = amd_bytealign_S (w[12], w[13], offset); + w[26] = amd_bytealign_S (w[11], w[12], offset); + w[25] = amd_bytealign_S (w[10], w[11], offset); + w[24] = amd_bytealign_S (w[ 9], w[10], offset); + w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[14] = amd_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -27474,112 +45135,58 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 15: - w[63] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[62] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[61] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[60] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[59] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[58] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[57] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[56] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[55] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[54] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[53] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[52] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[51] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[50] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[49] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[48] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[47] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[46] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[45] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[44] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[43] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[42] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[41] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[40] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[39] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[38] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[37] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[36] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[35] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[34] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[33] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[32] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[31] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[30] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[29] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[28] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[27] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[26] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[25] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[24] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[23] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[22] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[21] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[20] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[19] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[18] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[17] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[16] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[15] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[47], w[48], offset); + w[62] = amd_bytealign_S (w[46], w[47], offset); + w[61] = amd_bytealign_S (w[45], w[46], offset); + w[60] = amd_bytealign_S (w[44], w[45], offset); + w[59] = amd_bytealign_S (w[43], w[44], offset); + w[58] = amd_bytealign_S (w[42], w[43], offset); + w[57] = amd_bytealign_S (w[41], w[42], offset); + w[56] = amd_bytealign_S (w[40], w[41], offset); + w[55] = amd_bytealign_S (w[39], w[40], offset); + w[54] = amd_bytealign_S (w[38], w[39], offset); + w[53] = amd_bytealign_S (w[37], w[38], offset); + w[52] = amd_bytealign_S (w[36], w[37], offset); + w[51] = amd_bytealign_S (w[35], w[36], offset); + w[50] = amd_bytealign_S (w[34], w[35], offset); + w[49] = amd_bytealign_S (w[33], w[34], offset); + w[48] = amd_bytealign_S (w[32], w[33], offset); + w[47] = amd_bytealign_S (w[31], w[32], offset); + w[46] = amd_bytealign_S (w[30], w[31], offset); + w[45] = amd_bytealign_S (w[29], w[30], offset); + w[44] = amd_bytealign_S (w[28], w[29], offset); + w[43] = amd_bytealign_S (w[27], w[28], offset); + w[42] = amd_bytealign_S (w[26], w[27], offset); + w[41] = amd_bytealign_S (w[25], w[26], offset); + w[40] = amd_bytealign_S (w[24], w[25], offset); + w[39] = amd_bytealign_S (w[23], w[24], offset); + w[38] = amd_bytealign_S (w[22], w[23], offset); + w[37] = amd_bytealign_S (w[21], w[22], offset); + w[36] = amd_bytealign_S (w[20], w[21], offset); + w[35] = amd_bytealign_S (w[19], w[20], offset); + w[34] = amd_bytealign_S (w[18], w[19], offset); + w[33] = amd_bytealign_S (w[17], w[18], offset); + w[32] = amd_bytealign_S (w[16], w[17], offset); + w[31] = amd_bytealign_S (w[15], w[16], offset); + w[30] = amd_bytealign_S (w[14], w[15], offset); + w[29] = amd_bytealign_S (w[13], w[14], offset); + w[28] = amd_bytealign_S (w[12], w[13], offset); + w[27] = amd_bytealign_S (w[11], w[12], offset); + w[26] = amd_bytealign_S (w[10], w[11], offset); + w[25] = amd_bytealign_S (w[ 9], w[10], offset); + w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[15] = amd_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -27596,110 +45203,57 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 16: - w[63] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[62] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[61] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[60] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[59] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[58] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[57] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[56] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[55] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[54] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[53] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[52] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[51] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[50] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[49] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[48] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[47] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[46] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[45] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[44] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[43] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[42] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[41] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[40] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[39] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[38] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[37] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[36] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[35] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[34] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[33] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[32] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[31] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[30] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[29] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[28] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[27] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[26] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[25] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[24] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[23] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[22] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[21] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[20] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[19] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[18] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[17] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[16] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[46], w[47], offset); + w[62] = amd_bytealign_S (w[45], w[46], offset); + w[61] = amd_bytealign_S (w[44], w[45], offset); + w[60] = amd_bytealign_S (w[43], w[44], offset); + w[59] = amd_bytealign_S (w[42], w[43], offset); + w[58] = amd_bytealign_S (w[41], w[42], offset); + w[57] = amd_bytealign_S (w[40], w[41], offset); + w[56] = amd_bytealign_S (w[39], w[40], offset); + w[55] = amd_bytealign_S (w[38], w[39], offset); + w[54] = amd_bytealign_S (w[37], w[38], offset); + w[53] = amd_bytealign_S (w[36], w[37], offset); + w[52] = amd_bytealign_S (w[35], w[36], offset); + w[51] = amd_bytealign_S (w[34], w[35], offset); + w[50] = amd_bytealign_S (w[33], w[34], offset); + w[49] = amd_bytealign_S (w[32], w[33], offset); + w[48] = amd_bytealign_S (w[31], w[32], offset); + w[47] = amd_bytealign_S (w[30], w[31], offset); + w[46] = amd_bytealign_S (w[29], w[30], offset); + w[45] = amd_bytealign_S (w[28], w[29], offset); + w[44] = amd_bytealign_S (w[27], w[28], offset); + w[43] = amd_bytealign_S (w[26], w[27], offset); + w[42] = amd_bytealign_S (w[25], w[26], offset); + w[41] = amd_bytealign_S (w[24], w[25], offset); + w[40] = amd_bytealign_S (w[23], w[24], offset); + w[39] = amd_bytealign_S (w[22], w[23], offset); + w[38] = amd_bytealign_S (w[21], w[22], offset); + w[37] = amd_bytealign_S (w[20], w[21], offset); + w[36] = amd_bytealign_S (w[19], w[20], offset); + w[35] = amd_bytealign_S (w[18], w[19], offset); + w[34] = amd_bytealign_S (w[17], w[18], offset); + w[33] = amd_bytealign_S (w[16], w[17], offset); + w[32] = amd_bytealign_S (w[15], w[16], offset); + w[31] = amd_bytealign_S (w[14], w[15], offset); + w[30] = amd_bytealign_S (w[13], w[14], offset); + w[29] = amd_bytealign_S (w[12], w[13], offset); + w[28] = amd_bytealign_S (w[11], w[12], offset); + w[27] = amd_bytealign_S (w[10], w[11], offset); + w[26] = amd_bytealign_S (w[ 9], w[10], offset); + w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[16] = amd_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ -27717,108 +45271,56 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 17: - w[63] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[62] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[61] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[60] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[59] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[58] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[57] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[56] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[55] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[54] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[53] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[52] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[51] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[50] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[49] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[48] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[47] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[46] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[45] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[44] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[43] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[42] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[41] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[40] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[39] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[38] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[37] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[36] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[35] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[34] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[33] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[32] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[31] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[30] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[29] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[28] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[27] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[26] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[25] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[24] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[23] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[22] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[21] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[20] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[19] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[18] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[17] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[45], w[46], offset); + w[62] = amd_bytealign_S (w[44], w[45], offset); + w[61] = amd_bytealign_S (w[43], w[44], offset); + w[60] = amd_bytealign_S (w[42], w[43], offset); + w[59] = amd_bytealign_S (w[41], w[42], offset); + w[58] = amd_bytealign_S (w[40], w[41], offset); + w[57] = amd_bytealign_S (w[39], w[40], offset); + w[56] = amd_bytealign_S (w[38], w[39], offset); + w[55] = amd_bytealign_S (w[37], w[38], offset); + w[54] = amd_bytealign_S (w[36], w[37], offset); + w[53] = amd_bytealign_S (w[35], w[36], offset); + w[52] = amd_bytealign_S (w[34], w[35], offset); + w[51] = amd_bytealign_S (w[33], w[34], offset); + w[50] = amd_bytealign_S (w[32], w[33], offset); + w[49] = amd_bytealign_S (w[31], w[32], offset); + w[48] = amd_bytealign_S (w[30], w[31], offset); + w[47] = amd_bytealign_S (w[29], w[30], offset); + w[46] = amd_bytealign_S (w[28], w[29], offset); + w[45] = amd_bytealign_S (w[27], w[28], offset); + w[44] = amd_bytealign_S (w[26], w[27], offset); + w[43] = amd_bytealign_S (w[25], w[26], offset); + w[42] = amd_bytealign_S (w[24], w[25], offset); + w[41] = amd_bytealign_S (w[23], w[24], offset); + w[40] = amd_bytealign_S (w[22], w[23], offset); + w[39] = amd_bytealign_S (w[21], w[22], offset); + w[38] = amd_bytealign_S (w[20], w[21], offset); + w[37] = amd_bytealign_S (w[19], w[20], offset); + w[36] = amd_bytealign_S (w[18], w[19], offset); + w[35] = amd_bytealign_S (w[17], w[18], offset); + w[34] = amd_bytealign_S (w[16], w[17], offset); + w[33] = amd_bytealign_S (w[15], w[16], offset); + w[32] = amd_bytealign_S (w[14], w[15], offset); + w[31] = amd_bytealign_S (w[13], w[14], offset); + w[30] = amd_bytealign_S (w[12], w[13], offset); + w[29] = amd_bytealign_S (w[11], w[12], offset); + w[28] = amd_bytealign_S (w[10], w[11], offset); + w[27] = amd_bytealign_S (w[ 9], w[10], offset); + w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[17] = amd_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -27837,106 +45339,55 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 18: - w[63] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[62] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[61] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[60] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[59] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[58] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[57] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[56] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[55] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[54] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[53] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[52] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[51] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[50] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[49] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[48] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[47] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[46] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[45] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[44] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[43] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[42] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[41] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[40] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[39] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[38] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[37] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[36] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[35] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[34] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[33] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[32] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[31] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[30] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[29] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[28] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[27] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[26] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[25] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[24] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[23] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[22] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[21] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[20] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[19] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[18] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[44], w[45], offset); + w[62] = amd_bytealign_S (w[43], w[44], offset); + w[61] = amd_bytealign_S (w[42], w[43], offset); + w[60] = amd_bytealign_S (w[41], w[42], offset); + w[59] = amd_bytealign_S (w[40], w[41], offset); + w[58] = amd_bytealign_S (w[39], w[40], offset); + w[57] = amd_bytealign_S (w[38], w[39], offset); + w[56] = amd_bytealign_S (w[37], w[38], offset); + w[55] = amd_bytealign_S (w[36], w[37], offset); + w[54] = amd_bytealign_S (w[35], w[36], offset); + w[53] = amd_bytealign_S (w[34], w[35], offset); + w[52] = amd_bytealign_S (w[33], w[34], offset); + w[51] = amd_bytealign_S (w[32], w[33], offset); + w[50] = amd_bytealign_S (w[31], w[32], offset); + w[49] = amd_bytealign_S (w[30], w[31], offset); + w[48] = amd_bytealign_S (w[29], w[30], offset); + w[47] = amd_bytealign_S (w[28], w[29], offset); + w[46] = amd_bytealign_S (w[27], w[28], offset); + w[45] = amd_bytealign_S (w[26], w[27], offset); + w[44] = amd_bytealign_S (w[25], w[26], offset); + w[43] = amd_bytealign_S (w[24], w[25], offset); + w[42] = amd_bytealign_S (w[23], w[24], offset); + w[41] = amd_bytealign_S (w[22], w[23], offset); + w[40] = amd_bytealign_S (w[21], w[22], offset); + w[39] = amd_bytealign_S (w[20], w[21], offset); + w[38] = amd_bytealign_S (w[19], w[20], offset); + w[37] = amd_bytealign_S (w[18], w[19], offset); + w[36] = amd_bytealign_S (w[17], w[18], offset); + w[35] = amd_bytealign_S (w[16], w[17], offset); + w[34] = amd_bytealign_S (w[15], w[16], offset); + w[33] = amd_bytealign_S (w[14], w[15], offset); + w[32] = amd_bytealign_S (w[13], w[14], offset); + w[31] = amd_bytealign_S (w[12], w[13], offset); + w[30] = amd_bytealign_S (w[11], w[12], offset); + w[29] = amd_bytealign_S (w[10], w[11], offset); + w[28] = amd_bytealign_S (w[ 9], w[10], offset); + w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[18] = amd_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -27956,104 +45407,54 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 19: - w[63] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[62] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[61] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[60] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[59] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[58] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[57] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[56] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[55] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[54] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[53] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[52] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[51] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[50] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[49] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[48] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[47] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[46] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[45] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[44] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[43] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[42] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[41] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[40] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[39] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[38] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[37] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[36] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[35] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[34] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[33] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[32] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[31] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[30] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[29] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[28] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[27] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[26] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[25] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[24] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[23] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[22] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[21] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[20] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[19] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[43], w[44], offset); + w[62] = amd_bytealign_S (w[42], w[43], offset); + w[61] = amd_bytealign_S (w[41], w[42], offset); + w[60] = amd_bytealign_S (w[40], w[41], offset); + w[59] = amd_bytealign_S (w[39], w[40], offset); + w[58] = amd_bytealign_S (w[38], w[39], offset); + w[57] = amd_bytealign_S (w[37], w[38], offset); + w[56] = amd_bytealign_S (w[36], w[37], offset); + w[55] = amd_bytealign_S (w[35], w[36], offset); + w[54] = amd_bytealign_S (w[34], w[35], offset); + w[53] = amd_bytealign_S (w[33], w[34], offset); + w[52] = amd_bytealign_S (w[32], w[33], offset); + w[51] = amd_bytealign_S (w[31], w[32], offset); + w[50] = amd_bytealign_S (w[30], w[31], offset); + w[49] = amd_bytealign_S (w[29], w[30], offset); + w[48] = amd_bytealign_S (w[28], w[29], offset); + w[47] = amd_bytealign_S (w[27], w[28], offset); + w[46] = amd_bytealign_S (w[26], w[27], offset); + w[45] = amd_bytealign_S (w[25], w[26], offset); + w[44] = amd_bytealign_S (w[24], w[25], offset); + w[43] = amd_bytealign_S (w[23], w[24], offset); + w[42] = amd_bytealign_S (w[22], w[23], offset); + w[41] = amd_bytealign_S (w[21], w[22], offset); + w[40] = amd_bytealign_S (w[20], w[21], offset); + w[39] = amd_bytealign_S (w[19], w[20], offset); + w[38] = amd_bytealign_S (w[18], w[19], offset); + w[37] = amd_bytealign_S (w[17], w[18], offset); + w[36] = amd_bytealign_S (w[16], w[17], offset); + w[35] = amd_bytealign_S (w[15], w[16], offset); + w[34] = amd_bytealign_S (w[14], w[15], offset); + w[33] = amd_bytealign_S (w[13], w[14], offset); + w[32] = amd_bytealign_S (w[12], w[13], offset); + w[31] = amd_bytealign_S (w[11], w[12], offset); + w[30] = amd_bytealign_S (w[10], w[11], offset); + w[29] = amd_bytealign_S (w[ 9], w[10], offset); + w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[19] = amd_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -28074,102 +45475,53 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 20: - w[63] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[62] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[61] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[60] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[59] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[58] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[57] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[56] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[55] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[54] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[53] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[52] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[51] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[50] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[49] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[48] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[47] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[46] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[45] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[44] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[43] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[42] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[41] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[40] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[39] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[38] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[37] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[36] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[35] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[34] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[33] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[32] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[31] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[30] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[29] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[28] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[27] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[26] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[25] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[24] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[23] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[22] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[21] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[20] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[42], w[43], offset); + w[62] = amd_bytealign_S (w[41], w[42], offset); + w[61] = amd_bytealign_S (w[40], w[41], offset); + w[60] = amd_bytealign_S (w[39], w[40], offset); + w[59] = amd_bytealign_S (w[38], w[39], offset); + w[58] = amd_bytealign_S (w[37], w[38], offset); + w[57] = amd_bytealign_S (w[36], w[37], offset); + w[56] = amd_bytealign_S (w[35], w[36], offset); + w[55] = amd_bytealign_S (w[34], w[35], offset); + w[54] = amd_bytealign_S (w[33], w[34], offset); + w[53] = amd_bytealign_S (w[32], w[33], offset); + w[52] = amd_bytealign_S (w[31], w[32], offset); + w[51] = amd_bytealign_S (w[30], w[31], offset); + w[50] = amd_bytealign_S (w[29], w[30], offset); + w[49] = amd_bytealign_S (w[28], w[29], offset); + w[48] = amd_bytealign_S (w[27], w[28], offset); + w[47] = amd_bytealign_S (w[26], w[27], offset); + w[46] = amd_bytealign_S (w[25], w[26], offset); + w[45] = amd_bytealign_S (w[24], w[25], offset); + w[44] = amd_bytealign_S (w[23], w[24], offset); + w[43] = amd_bytealign_S (w[22], w[23], offset); + w[42] = amd_bytealign_S (w[21], w[22], offset); + w[41] = amd_bytealign_S (w[20], w[21], offset); + w[40] = amd_bytealign_S (w[19], w[20], offset); + w[39] = amd_bytealign_S (w[18], w[19], offset); + w[38] = amd_bytealign_S (w[17], w[18], offset); + w[37] = amd_bytealign_S (w[16], w[17], offset); + w[36] = amd_bytealign_S (w[15], w[16], offset); + w[35] = amd_bytealign_S (w[14], w[15], offset); + w[34] = amd_bytealign_S (w[13], w[14], offset); + w[33] = amd_bytealign_S (w[12], w[13], offset); + w[32] = amd_bytealign_S (w[11], w[12], offset); + w[31] = amd_bytealign_S (w[10], w[11], offset); + w[30] = amd_bytealign_S (w[ 9], w[10], offset); + w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[20] = amd_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -28191,100 +45543,52 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 21: - w[63] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[62] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[61] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[60] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[59] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[58] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[57] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[56] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[55] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[54] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[53] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[52] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[51] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[50] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[49] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[48] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[47] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[46] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[45] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[44] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[43] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[42] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[41] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[40] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[39] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[38] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[37] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[36] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[35] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[34] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[33] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[32] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[31] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[30] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[29] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[28] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[27] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[26] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[25] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[24] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[23] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[22] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[21] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[41], w[42], offset); + w[62] = amd_bytealign_S (w[40], w[41], offset); + w[61] = amd_bytealign_S (w[39], w[40], offset); + w[60] = amd_bytealign_S (w[38], w[39], offset); + w[59] = amd_bytealign_S (w[37], w[38], offset); + w[58] = amd_bytealign_S (w[36], w[37], offset); + w[57] = amd_bytealign_S (w[35], w[36], offset); + w[56] = amd_bytealign_S (w[34], w[35], offset); + w[55] = amd_bytealign_S (w[33], w[34], offset); + w[54] = amd_bytealign_S (w[32], w[33], offset); + w[53] = amd_bytealign_S (w[31], w[32], offset); + w[52] = amd_bytealign_S (w[30], w[31], offset); + w[51] = amd_bytealign_S (w[29], w[30], offset); + w[50] = amd_bytealign_S (w[28], w[29], offset); + w[49] = amd_bytealign_S (w[27], w[28], offset); + w[48] = amd_bytealign_S (w[26], w[27], offset); + w[47] = amd_bytealign_S (w[25], w[26], offset); + w[46] = amd_bytealign_S (w[24], w[25], offset); + w[45] = amd_bytealign_S (w[23], w[24], offset); + w[44] = amd_bytealign_S (w[22], w[23], offset); + w[43] = amd_bytealign_S (w[21], w[22], offset); + w[42] = amd_bytealign_S (w[20], w[21], offset); + w[41] = amd_bytealign_S (w[19], w[20], offset); + w[40] = amd_bytealign_S (w[18], w[19], offset); + w[39] = amd_bytealign_S (w[17], w[18], offset); + w[38] = amd_bytealign_S (w[16], w[17], offset); + w[37] = amd_bytealign_S (w[15], w[16], offset); + w[36] = amd_bytealign_S (w[14], w[15], offset); + w[35] = amd_bytealign_S (w[13], w[14], offset); + w[34] = amd_bytealign_S (w[12], w[13], offset); + w[33] = amd_bytealign_S (w[11], w[12], offset); + w[32] = amd_bytealign_S (w[10], w[11], offset); + w[31] = amd_bytealign_S (w[ 9], w[10], offset); + w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[21] = amd_bytealign_S ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -28307,98 +45611,51 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 22: - w[63] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[62] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[61] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[60] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[59] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[58] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[57] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[56] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[55] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[54] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[53] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[52] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[51] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[50] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[49] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[48] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[47] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[46] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[45] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[44] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[43] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[42] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[41] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[40] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[39] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[38] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[37] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[36] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[35] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[34] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[33] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[32] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[31] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[30] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[29] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[28] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[27] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[26] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[25] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[24] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[23] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[22] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[40], w[41], offset); + w[62] = amd_bytealign_S (w[39], w[40], offset); + w[61] = amd_bytealign_S (w[38], w[39], offset); + w[60] = amd_bytealign_S (w[37], w[38], offset); + w[59] = amd_bytealign_S (w[36], w[37], offset); + w[58] = amd_bytealign_S (w[35], w[36], offset); + w[57] = amd_bytealign_S (w[34], w[35], offset); + w[56] = amd_bytealign_S (w[33], w[34], offset); + w[55] = amd_bytealign_S (w[32], w[33], offset); + w[54] = amd_bytealign_S (w[31], w[32], offset); + w[53] = amd_bytealign_S (w[30], w[31], offset); + w[52] = amd_bytealign_S (w[29], w[30], offset); + w[51] = amd_bytealign_S (w[28], w[29], offset); + w[50] = amd_bytealign_S (w[27], w[28], offset); + w[49] = amd_bytealign_S (w[26], w[27], offset); + w[48] = amd_bytealign_S (w[25], w[26], offset); + w[47] = amd_bytealign_S (w[24], w[25], offset); + w[46] = amd_bytealign_S (w[23], w[24], offset); + w[45] = amd_bytealign_S (w[22], w[23], offset); + w[44] = amd_bytealign_S (w[21], w[22], offset); + w[43] = amd_bytealign_S (w[20], w[21], offset); + w[42] = amd_bytealign_S (w[19], w[20], offset); + w[41] = amd_bytealign_S (w[18], w[19], offset); + w[40] = amd_bytealign_S (w[17], w[18], offset); + w[39] = amd_bytealign_S (w[16], w[17], offset); + w[38] = amd_bytealign_S (w[15], w[16], offset); + w[37] = amd_bytealign_S (w[14], w[15], offset); + w[36] = amd_bytealign_S (w[13], w[14], offset); + w[35] = amd_bytealign_S (w[12], w[13], offset); + w[34] = amd_bytealign_S (w[11], w[12], offset); + w[33] = amd_bytealign_S (w[10], w[11], offset); + w[32] = amd_bytealign_S (w[ 9], w[10], offset); + w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[22] = amd_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -28422,96 +45679,50 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 23: - w[63] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[62] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[61] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[60] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[59] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[58] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[57] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[56] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[55] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[54] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[53] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[52] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[51] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[50] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[49] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[48] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[47] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[46] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[45] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[44] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[43] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[42] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[41] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[40] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[39] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[38] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[37] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[36] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[35] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[34] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[33] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[32] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[31] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[30] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[29] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[28] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[27] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[26] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[25] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[24] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[23] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[39], w[40], offset); + w[62] = amd_bytealign_S (w[38], w[39], offset); + w[61] = amd_bytealign_S (w[37], w[38], offset); + w[60] = amd_bytealign_S (w[36], w[37], offset); + w[59] = amd_bytealign_S (w[35], w[36], offset); + w[58] = amd_bytealign_S (w[34], w[35], offset); + w[57] = amd_bytealign_S (w[33], w[34], offset); + w[56] = amd_bytealign_S (w[32], w[33], offset); + w[55] = amd_bytealign_S (w[31], w[32], offset); + w[54] = amd_bytealign_S (w[30], w[31], offset); + w[53] = amd_bytealign_S (w[29], w[30], offset); + w[52] = amd_bytealign_S (w[28], w[29], offset); + w[51] = amd_bytealign_S (w[27], w[28], offset); + w[50] = amd_bytealign_S (w[26], w[27], offset); + w[49] = amd_bytealign_S (w[25], w[26], offset); + w[48] = amd_bytealign_S (w[24], w[25], offset); + w[47] = amd_bytealign_S (w[23], w[24], offset); + w[46] = amd_bytealign_S (w[22], w[23], offset); + w[45] = amd_bytealign_S (w[21], w[22], offset); + w[44] = amd_bytealign_S (w[20], w[21], offset); + w[43] = amd_bytealign_S (w[19], w[20], offset); + w[42] = amd_bytealign_S (w[18], w[19], offset); + w[41] = amd_bytealign_S (w[17], w[18], offset); + w[40] = amd_bytealign_S (w[16], w[17], offset); + w[39] = amd_bytealign_S (w[15], w[16], offset); + w[38] = amd_bytealign_S (w[14], w[15], offset); + w[37] = amd_bytealign_S (w[13], w[14], offset); + w[36] = amd_bytealign_S (w[12], w[13], offset); + w[35] = amd_bytealign_S (w[11], w[12], offset); + w[34] = amd_bytealign_S (w[10], w[11], offset); + w[33] = amd_bytealign_S (w[ 9], w[10], offset); + w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[23] = amd_bytealign_S ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -28536,94 +45747,49 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 24: - w[63] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[62] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[61] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[60] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[59] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[58] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[57] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[56] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[55] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[54] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[53] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[52] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[51] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[50] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[49] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[48] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[47] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[46] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[45] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[44] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[43] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[42] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[41] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[40] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[39] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[38] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[37] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[36] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[35] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[34] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[33] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[32] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[31] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[30] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[29] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[28] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[27] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[26] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[25] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[24] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[38], w[39], offset); + w[62] = amd_bytealign_S (w[37], w[38], offset); + w[61] = amd_bytealign_S (w[36], w[37], offset); + w[60] = amd_bytealign_S (w[35], w[36], offset); + w[59] = amd_bytealign_S (w[34], w[35], offset); + w[58] = amd_bytealign_S (w[33], w[34], offset); + w[57] = amd_bytealign_S (w[32], w[33], offset); + w[56] = amd_bytealign_S (w[31], w[32], offset); + w[55] = amd_bytealign_S (w[30], w[31], offset); + w[54] = amd_bytealign_S (w[29], w[30], offset); + w[53] = amd_bytealign_S (w[28], w[29], offset); + w[52] = amd_bytealign_S (w[27], w[28], offset); + w[51] = amd_bytealign_S (w[26], w[27], offset); + w[50] = amd_bytealign_S (w[25], w[26], offset); + w[49] = amd_bytealign_S (w[24], w[25], offset); + w[48] = amd_bytealign_S (w[23], w[24], offset); + w[47] = amd_bytealign_S (w[22], w[23], offset); + w[46] = amd_bytealign_S (w[21], w[22], offset); + w[45] = amd_bytealign_S (w[20], w[21], offset); + w[44] = amd_bytealign_S (w[19], w[20], offset); + w[43] = amd_bytealign_S (w[18], w[19], offset); + w[42] = amd_bytealign_S (w[17], w[18], offset); + w[41] = amd_bytealign_S (w[16], w[17], offset); + w[40] = amd_bytealign_S (w[15], w[16], offset); + w[39] = amd_bytealign_S (w[14], w[15], offset); + w[38] = amd_bytealign_S (w[13], w[14], offset); + w[37] = amd_bytealign_S (w[12], w[13], offset); + w[36] = amd_bytealign_S (w[11], w[12], offset); + w[35] = amd_bytealign_S (w[10], w[11], offset); + w[34] = amd_bytealign_S (w[ 9], w[10], offset); + w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[24] = amd_bytealign_S ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -28649,92 +45815,48 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 25: - w[63] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[62] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[61] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[60] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[59] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[58] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[57] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[56] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[55] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[54] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[53] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[52] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[51] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[50] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[49] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[48] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[47] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[46] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[45] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[44] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[43] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[42] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[41] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[40] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[39] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[38] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[37] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[36] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[35] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[34] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[33] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[32] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[31] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[30] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[29] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[28] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[27] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[26] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[25] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[37], w[38], offset); + w[62] = amd_bytealign_S (w[36], w[37], offset); + w[61] = amd_bytealign_S (w[35], w[36], offset); + w[60] = amd_bytealign_S (w[34], w[35], offset); + w[59] = amd_bytealign_S (w[33], w[34], offset); + w[58] = amd_bytealign_S (w[32], w[33], offset); + w[57] = amd_bytealign_S (w[31], w[32], offset); + w[56] = amd_bytealign_S (w[30], w[31], offset); + w[55] = amd_bytealign_S (w[29], w[30], offset); + w[54] = amd_bytealign_S (w[28], w[29], offset); + w[53] = amd_bytealign_S (w[27], w[28], offset); + w[52] = amd_bytealign_S (w[26], w[27], offset); + w[51] = amd_bytealign_S (w[25], w[26], offset); + w[50] = amd_bytealign_S (w[24], w[25], offset); + w[49] = amd_bytealign_S (w[23], w[24], offset); + w[48] = amd_bytealign_S (w[22], w[23], offset); + w[47] = amd_bytealign_S (w[21], w[22], offset); + w[46] = amd_bytealign_S (w[20], w[21], offset); + w[45] = amd_bytealign_S (w[19], w[20], offset); + w[44] = amd_bytealign_S (w[18], w[19], offset); + w[43] = amd_bytealign_S (w[17], w[18], offset); + w[42] = amd_bytealign_S (w[16], w[17], offset); + w[41] = amd_bytealign_S (w[15], w[16], offset); + w[40] = amd_bytealign_S (w[14], w[15], offset); + w[39] = amd_bytealign_S (w[13], w[14], offset); + w[38] = amd_bytealign_S (w[12], w[13], offset); + w[37] = amd_bytealign_S (w[11], w[12], offset); + w[36] = amd_bytealign_S (w[10], w[11], offset); + w[35] = amd_bytealign_S (w[ 9], w[10], offset); + w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[25] = amd_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -28761,90 +45883,47 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 26: - w[63] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[62] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[61] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[60] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[59] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[58] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[57] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[56] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[55] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[54] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[53] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[52] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[51] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[50] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[49] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[48] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[47] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[46] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[45] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[44] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[43] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[42] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[41] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[40] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[39] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[38] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[37] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[36] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[35] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[34] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[33] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[32] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[31] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[30] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[29] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[28] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[27] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[26] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[36], w[37], offset); + w[62] = amd_bytealign_S (w[35], w[36], offset); + w[61] = amd_bytealign_S (w[34], w[35], offset); + w[60] = amd_bytealign_S (w[33], w[34], offset); + w[59] = amd_bytealign_S (w[32], w[33], offset); + w[58] = amd_bytealign_S (w[31], w[32], offset); + w[57] = amd_bytealign_S (w[30], w[31], offset); + w[56] = amd_bytealign_S (w[29], w[30], offset); + w[55] = amd_bytealign_S (w[28], w[29], offset); + w[54] = amd_bytealign_S (w[27], w[28], offset); + w[53] = amd_bytealign_S (w[26], w[27], offset); + w[52] = amd_bytealign_S (w[25], w[26], offset); + w[51] = amd_bytealign_S (w[24], w[25], offset); + w[50] = amd_bytealign_S (w[23], w[24], offset); + w[49] = amd_bytealign_S (w[22], w[23], offset); + w[48] = amd_bytealign_S (w[21], w[22], offset); + w[47] = amd_bytealign_S (w[20], w[21], offset); + w[46] = amd_bytealign_S (w[19], w[20], offset); + w[45] = amd_bytealign_S (w[18], w[19], offset); + w[44] = amd_bytealign_S (w[17], w[18], offset); + w[43] = amd_bytealign_S (w[16], w[17], offset); + w[42] = amd_bytealign_S (w[15], w[16], offset); + w[41] = amd_bytealign_S (w[14], w[15], offset); + w[40] = amd_bytealign_S (w[13], w[14], offset); + w[39] = amd_bytealign_S (w[12], w[13], offset); + w[38] = amd_bytealign_S (w[11], w[12], offset); + w[37] = amd_bytealign_S (w[10], w[11], offset); + w[36] = amd_bytealign_S (w[ 9], w[10], offset); + w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[26] = amd_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -28872,88 +45951,46 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 27: - w[63] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[62] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[61] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[60] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[59] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[58] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[57] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[56] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[55] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[54] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[53] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[52] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[51] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[50] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[49] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[48] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[47] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[46] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[45] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[44] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[43] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[42] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[41] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[40] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[39] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[38] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[37] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[36] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[35] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[34] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[33] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[32] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[31] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[30] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[29] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[28] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[27] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[35], w[36], offset); + w[62] = amd_bytealign_S (w[34], w[35], offset); + w[61] = amd_bytealign_S (w[33], w[34], offset); + w[60] = amd_bytealign_S (w[32], w[33], offset); + w[59] = amd_bytealign_S (w[31], w[32], offset); + w[58] = amd_bytealign_S (w[30], w[31], offset); + w[57] = amd_bytealign_S (w[29], w[30], offset); + w[56] = amd_bytealign_S (w[28], w[29], offset); + w[55] = amd_bytealign_S (w[27], w[28], offset); + w[54] = amd_bytealign_S (w[26], w[27], offset); + w[53] = amd_bytealign_S (w[25], w[26], offset); + w[52] = amd_bytealign_S (w[24], w[25], offset); + w[51] = amd_bytealign_S (w[23], w[24], offset); + w[50] = amd_bytealign_S (w[22], w[23], offset); + w[49] = amd_bytealign_S (w[21], w[22], offset); + w[48] = amd_bytealign_S (w[20], w[21], offset); + w[47] = amd_bytealign_S (w[19], w[20], offset); + w[46] = amd_bytealign_S (w[18], w[19], offset); + w[45] = amd_bytealign_S (w[17], w[18], offset); + w[44] = amd_bytealign_S (w[16], w[17], offset); + w[43] = amd_bytealign_S (w[15], w[16], offset); + w[42] = amd_bytealign_S (w[14], w[15], offset); + w[41] = amd_bytealign_S (w[13], w[14], offset); + w[40] = amd_bytealign_S (w[12], w[13], offset); + w[39] = amd_bytealign_S (w[11], w[12], offset); + w[38] = amd_bytealign_S (w[10], w[11], offset); + w[37] = amd_bytealign_S (w[ 9], w[10], offset); + w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[27] = amd_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -28982,86 +46019,45 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 28: - w[63] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[62] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[61] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[60] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[59] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[58] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[57] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[56] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[55] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[54] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[53] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[52] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[51] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[50] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[49] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[48] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[47] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[46] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[45] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[44] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[43] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[42] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[41] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[40] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[39] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[38] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[37] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[36] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[35] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[34] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[33] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[32] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[31] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[30] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[29] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[28] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[34], w[35], offset); + w[62] = amd_bytealign_S (w[33], w[34], offset); + w[61] = amd_bytealign_S (w[32], w[33], offset); + w[60] = amd_bytealign_S (w[31], w[32], offset); + w[59] = amd_bytealign_S (w[30], w[31], offset); + w[58] = amd_bytealign_S (w[29], w[30], offset); + w[57] = amd_bytealign_S (w[28], w[29], offset); + w[56] = amd_bytealign_S (w[27], w[28], offset); + w[55] = amd_bytealign_S (w[26], w[27], offset); + w[54] = amd_bytealign_S (w[25], w[26], offset); + w[53] = amd_bytealign_S (w[24], w[25], offset); + w[52] = amd_bytealign_S (w[23], w[24], offset); + w[51] = amd_bytealign_S (w[22], w[23], offset); + w[50] = amd_bytealign_S (w[21], w[22], offset); + w[49] = amd_bytealign_S (w[20], w[21], offset); + w[48] = amd_bytealign_S (w[19], w[20], offset); + w[47] = amd_bytealign_S (w[18], w[19], offset); + w[46] = amd_bytealign_S (w[17], w[18], offset); + w[45] = amd_bytealign_S (w[16], w[17], offset); + w[44] = amd_bytealign_S (w[15], w[16], offset); + w[43] = amd_bytealign_S (w[14], w[15], offset); + w[42] = amd_bytealign_S (w[13], w[14], offset); + w[41] = amd_bytealign_S (w[12], w[13], offset); + w[40] = amd_bytealign_S (w[11], w[12], offset); + w[39] = amd_bytealign_S (w[10], w[11], offset); + w[38] = amd_bytealign_S (w[ 9], w[10], offset); + w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[28] = amd_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -29091,84 +46087,44 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 29: - w[63] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[62] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[61] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[60] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[59] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[58] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[57] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[56] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[55] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[54] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[53] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[52] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[51] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[50] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[49] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[48] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[47] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[46] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[45] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[44] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[43] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[42] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[41] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[40] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[39] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[38] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[37] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[36] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[35] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[34] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[33] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[32] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[31] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[30] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[29] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[33], w[34], offset); + w[62] = amd_bytealign_S (w[32], w[33], offset); + w[61] = amd_bytealign_S (w[31], w[32], offset); + w[60] = amd_bytealign_S (w[30], w[31], offset); + w[59] = amd_bytealign_S (w[29], w[30], offset); + w[58] = amd_bytealign_S (w[28], w[29], offset); + w[57] = amd_bytealign_S (w[27], w[28], offset); + w[56] = amd_bytealign_S (w[26], w[27], offset); + w[55] = amd_bytealign_S (w[25], w[26], offset); + w[54] = amd_bytealign_S (w[24], w[25], offset); + w[53] = amd_bytealign_S (w[23], w[24], offset); + w[52] = amd_bytealign_S (w[22], w[23], offset); + w[51] = amd_bytealign_S (w[21], w[22], offset); + w[50] = amd_bytealign_S (w[20], w[21], offset); + w[49] = amd_bytealign_S (w[19], w[20], offset); + w[48] = amd_bytealign_S (w[18], w[19], offset); + w[47] = amd_bytealign_S (w[17], w[18], offset); + w[46] = amd_bytealign_S (w[16], w[17], offset); + w[45] = amd_bytealign_S (w[15], w[16], offset); + w[44] = amd_bytealign_S (w[14], w[15], offset); + w[43] = amd_bytealign_S (w[13], w[14], offset); + w[42] = amd_bytealign_S (w[12], w[13], offset); + w[41] = amd_bytealign_S (w[11], w[12], offset); + w[40] = amd_bytealign_S (w[10], w[11], offset); + w[39] = amd_bytealign_S (w[ 9], w[10], offset); + w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[29] = amd_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -29199,82 +46155,43 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 30: - w[63] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[62] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[61] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[60] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[59] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[58] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[57] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[56] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[55] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[54] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[53] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[52] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[51] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[50] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[49] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[48] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[47] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[46] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[45] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[44] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[43] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[42] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[41] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[40] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[39] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[38] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[37] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[36] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[35] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[34] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[33] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[32] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[31] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[30] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[32], w[33], offset); + w[62] = amd_bytealign_S (w[31], w[32], offset); + w[61] = amd_bytealign_S (w[30], w[31], offset); + w[60] = amd_bytealign_S (w[29], w[30], offset); + w[59] = amd_bytealign_S (w[28], w[29], offset); + w[58] = amd_bytealign_S (w[27], w[28], offset); + w[57] = amd_bytealign_S (w[26], w[27], offset); + w[56] = amd_bytealign_S (w[25], w[26], offset); + w[55] = amd_bytealign_S (w[24], w[25], offset); + w[54] = amd_bytealign_S (w[23], w[24], offset); + w[53] = amd_bytealign_S (w[22], w[23], offset); + w[52] = amd_bytealign_S (w[21], w[22], offset); + w[51] = amd_bytealign_S (w[20], w[21], offset); + w[50] = amd_bytealign_S (w[19], w[20], offset); + w[49] = amd_bytealign_S (w[18], w[19], offset); + w[48] = amd_bytealign_S (w[17], w[18], offset); + w[47] = amd_bytealign_S (w[16], w[17], offset); + w[46] = amd_bytealign_S (w[15], w[16], offset); + w[45] = amd_bytealign_S (w[14], w[15], offset); + w[44] = amd_bytealign_S (w[13], w[14], offset); + w[43] = amd_bytealign_S (w[12], w[13], offset); + w[42] = amd_bytealign_S (w[11], w[12], offset); + w[41] = amd_bytealign_S (w[10], w[11], offset); + w[40] = amd_bytealign_S (w[ 9], w[10], offset); + w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[30] = amd_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -29306,80 +46223,42 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 31: - w[63] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[62] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[61] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[60] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[59] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[58] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[57] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[56] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[55] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[54] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[53] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[52] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[51] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[50] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[49] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[48] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[47] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[46] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[45] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[44] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[43] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[42] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[41] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[40] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[39] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[38] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[37] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[36] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[35] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[34] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[33] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[32] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[31] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[31], w[32], offset); + w[62] = amd_bytealign_S (w[30], w[31], offset); + w[61] = amd_bytealign_S (w[29], w[30], offset); + w[60] = amd_bytealign_S (w[28], w[29], offset); + w[59] = amd_bytealign_S (w[27], w[28], offset); + w[58] = amd_bytealign_S (w[26], w[27], offset); + w[57] = amd_bytealign_S (w[25], w[26], offset); + w[56] = amd_bytealign_S (w[24], w[25], offset); + w[55] = amd_bytealign_S (w[23], w[24], offset); + w[54] = amd_bytealign_S (w[22], w[23], offset); + w[53] = amd_bytealign_S (w[21], w[22], offset); + w[52] = amd_bytealign_S (w[20], w[21], offset); + w[51] = amd_bytealign_S (w[19], w[20], offset); + w[50] = amd_bytealign_S (w[18], w[19], offset); + w[49] = amd_bytealign_S (w[17], w[18], offset); + w[48] = amd_bytealign_S (w[16], w[17], offset); + w[47] = amd_bytealign_S (w[15], w[16], offset); + w[46] = amd_bytealign_S (w[14], w[15], offset); + w[45] = amd_bytealign_S (w[13], w[14], offset); + w[44] = amd_bytealign_S (w[12], w[13], offset); + w[43] = amd_bytealign_S (w[11], w[12], offset); + w[42] = amd_bytealign_S (w[10], w[11], offset); + w[41] = amd_bytealign_S (w[ 9], w[10], offset); + w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[36] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[31] = amd_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -29412,78 +46291,41 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 32: - w[63] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[62] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[61] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[60] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[59] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[58] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[57] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[56] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[55] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[54] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[53] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[52] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[51] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[50] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[49] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[48] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[47] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[46] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[45] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[44] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[43] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[42] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[41] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[40] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[39] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[38] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[37] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[36] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[35] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[34] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[33] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[32] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[30], w[31], offset); + w[62] = amd_bytealign_S (w[29], w[30], offset); + w[61] = amd_bytealign_S (w[28], w[29], offset); + w[60] = amd_bytealign_S (w[27], w[28], offset); + w[59] = amd_bytealign_S (w[26], w[27], offset); + w[58] = amd_bytealign_S (w[25], w[26], offset); + w[57] = amd_bytealign_S (w[24], w[25], offset); + w[56] = amd_bytealign_S (w[23], w[24], offset); + w[55] = amd_bytealign_S (w[22], w[23], offset); + w[54] = amd_bytealign_S (w[21], w[22], offset); + w[53] = amd_bytealign_S (w[20], w[21], offset); + w[52] = amd_bytealign_S (w[19], w[20], offset); + w[51] = amd_bytealign_S (w[18], w[19], offset); + w[50] = amd_bytealign_S (w[17], w[18], offset); + w[49] = amd_bytealign_S (w[16], w[17], offset); + w[48] = amd_bytealign_S (w[15], w[16], offset); + w[47] = amd_bytealign_S (w[14], w[15], offset); + w[46] = amd_bytealign_S (w[13], w[14], offset); + w[45] = amd_bytealign_S (w[12], w[13], offset); + w[44] = amd_bytealign_S (w[11], w[12], offset); + w[43] = amd_bytealign_S (w[10], w[11], offset); + w[42] = amd_bytealign_S (w[ 9], w[10], offset); + w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[32] = amd_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -29517,76 +46359,40 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 33: - w[63] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[62] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[61] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[60] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[59] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[58] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[57] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[56] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[55] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[54] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[53] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[52] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[51] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[50] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[49] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[48] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[47] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[46] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[45] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[44] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[43] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[42] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[41] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[40] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[39] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[38] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[37] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[36] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[35] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[34] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[33] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[29], w[30], offset); + w[62] = amd_bytealign_S (w[28], w[29], offset); + w[61] = amd_bytealign_S (w[27], w[28], offset); + w[60] = amd_bytealign_S (w[26], w[27], offset); + w[59] = amd_bytealign_S (w[25], w[26], offset); + w[58] = amd_bytealign_S (w[24], w[25], offset); + w[57] = amd_bytealign_S (w[23], w[24], offset); + w[56] = amd_bytealign_S (w[22], w[23], offset); + w[55] = amd_bytealign_S (w[21], w[22], offset); + w[54] = amd_bytealign_S (w[20], w[21], offset); + w[53] = amd_bytealign_S (w[19], w[20], offset); + w[52] = amd_bytealign_S (w[18], w[19], offset); + w[51] = amd_bytealign_S (w[17], w[18], offset); + w[50] = amd_bytealign_S (w[16], w[17], offset); + w[49] = amd_bytealign_S (w[15], w[16], offset); + w[48] = amd_bytealign_S (w[14], w[15], offset); + w[47] = amd_bytealign_S (w[13], w[14], offset); + w[46] = amd_bytealign_S (w[12], w[13], offset); + w[45] = amd_bytealign_S (w[11], w[12], offset); + w[44] = amd_bytealign_S (w[10], w[11], offset); + w[43] = amd_bytealign_S (w[ 9], w[10], offset); + w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[33] = amd_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -29621,74 +46427,39 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 34: - w[63] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[62] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[61] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[60] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[59] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[58] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[57] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[56] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[55] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[54] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[53] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[52] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[51] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[50] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[49] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[48] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[47] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[46] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[45] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[44] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[43] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[42] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[41] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[40] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[39] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[38] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[37] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[36] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[35] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[34] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[28], w[29], offset); + w[62] = amd_bytealign_S (w[27], w[28], offset); + w[61] = amd_bytealign_S (w[26], w[27], offset); + w[60] = amd_bytealign_S (w[25], w[26], offset); + w[59] = amd_bytealign_S (w[24], w[25], offset); + w[58] = amd_bytealign_S (w[23], w[24], offset); + w[57] = amd_bytealign_S (w[22], w[23], offset); + w[56] = amd_bytealign_S (w[21], w[22], offset); + w[55] = amd_bytealign_S (w[20], w[21], offset); + w[54] = amd_bytealign_S (w[19], w[20], offset); + w[53] = amd_bytealign_S (w[18], w[19], offset); + w[52] = amd_bytealign_S (w[17], w[18], offset); + w[51] = amd_bytealign_S (w[16], w[17], offset); + w[50] = amd_bytealign_S (w[15], w[16], offset); + w[49] = amd_bytealign_S (w[14], w[15], offset); + w[48] = amd_bytealign_S (w[13], w[14], offset); + w[47] = amd_bytealign_S (w[12], w[13], offset); + w[46] = amd_bytealign_S (w[11], w[12], offset); + w[45] = amd_bytealign_S (w[10], w[11], offset); + w[44] = amd_bytealign_S (w[ 9], w[10], offset); + w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[34] = amd_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -29724,72 +46495,38 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 35: - w[63] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[62] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[61] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[60] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[59] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[58] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[57] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[56] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[55] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[54] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[53] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[52] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[51] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[50] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[49] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[48] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[47] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[46] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[45] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[44] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[43] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[42] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[41] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[40] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[39] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[38] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[37] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[36] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[35] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[27], w[28], offset); + w[62] = amd_bytealign_S (w[26], w[27], offset); + w[61] = amd_bytealign_S (w[25], w[26], offset); + w[60] = amd_bytealign_S (w[24], w[25], offset); + w[59] = amd_bytealign_S (w[23], w[24], offset); + w[58] = amd_bytealign_S (w[22], w[23], offset); + w[57] = amd_bytealign_S (w[21], w[22], offset); + w[56] = amd_bytealign_S (w[20], w[21], offset); + w[55] = amd_bytealign_S (w[19], w[20], offset); + w[54] = amd_bytealign_S (w[18], w[19], offset); + w[53] = amd_bytealign_S (w[17], w[18], offset); + w[52] = amd_bytealign_S (w[16], w[17], offset); + w[51] = amd_bytealign_S (w[15], w[16], offset); + w[50] = amd_bytealign_S (w[14], w[15], offset); + w[49] = amd_bytealign_S (w[13], w[14], offset); + w[48] = amd_bytealign_S (w[12], w[13], offset); + w[47] = amd_bytealign_S (w[11], w[12], offset); + w[46] = amd_bytealign_S (w[10], w[11], offset); + w[45] = amd_bytealign_S (w[ 9], w[10], offset); + w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[35] = amd_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -29826,70 +46563,37 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 36: - w[63] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[62] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[61] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[60] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[59] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[58] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[57] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[56] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[55] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[54] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[53] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[52] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[51] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[50] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[49] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[48] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[47] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[46] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[45] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[44] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[43] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[42] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[41] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[40] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[39] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[38] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[37] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[36] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[26], w[27], offset); + w[62] = amd_bytealign_S (w[25], w[26], offset); + w[61] = amd_bytealign_S (w[24], w[25], offset); + w[60] = amd_bytealign_S (w[23], w[24], offset); + w[59] = amd_bytealign_S (w[22], w[23], offset); + w[58] = amd_bytealign_S (w[21], w[22], offset); + w[57] = amd_bytealign_S (w[20], w[21], offset); + w[56] = amd_bytealign_S (w[19], w[20], offset); + w[55] = amd_bytealign_S (w[18], w[19], offset); + w[54] = amd_bytealign_S (w[17], w[18], offset); + w[53] = amd_bytealign_S (w[16], w[17], offset); + w[52] = amd_bytealign_S (w[15], w[16], offset); + w[51] = amd_bytealign_S (w[14], w[15], offset); + w[50] = amd_bytealign_S (w[13], w[14], offset); + w[49] = amd_bytealign_S (w[12], w[13], offset); + w[48] = amd_bytealign_S (w[11], w[12], offset); + w[47] = amd_bytealign_S (w[10], w[11], offset); + w[46] = amd_bytealign_S (w[ 9], w[10], offset); + w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[36] = amd_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -29927,68 +46631,36 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 37: - w[63] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[62] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[61] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[60] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[59] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[58] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[57] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[56] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[55] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[54] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[53] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[52] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[51] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[50] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[49] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[48] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[47] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[46] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[45] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[44] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[43] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[42] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[41] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[40] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[39] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[38] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[37] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[25], w[26], offset); + w[62] = amd_bytealign_S (w[24], w[25], offset); + w[61] = amd_bytealign_S (w[23], w[24], offset); + w[60] = amd_bytealign_S (w[22], w[23], offset); + w[59] = amd_bytealign_S (w[21], w[22], offset); + w[58] = amd_bytealign_S (w[20], w[21], offset); + w[57] = amd_bytealign_S (w[19], w[20], offset); + w[56] = amd_bytealign_S (w[18], w[19], offset); + w[55] = amd_bytealign_S (w[17], w[18], offset); + w[54] = amd_bytealign_S (w[16], w[17], offset); + w[53] = amd_bytealign_S (w[15], w[16], offset); + w[52] = amd_bytealign_S (w[14], w[15], offset); + w[51] = amd_bytealign_S (w[13], w[14], offset); + w[50] = amd_bytealign_S (w[12], w[13], offset); + w[49] = amd_bytealign_S (w[11], w[12], offset); + w[48] = amd_bytealign_S (w[10], w[11], offset); + w[47] = amd_bytealign_S (w[ 9], w[10], offset); + w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[37] = amd_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -30027,66 +46699,35 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 38: - w[63] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[62] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[61] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[60] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[59] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[58] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[57] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[56] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[55] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[54] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[53] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[52] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[51] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[50] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[49] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[48] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[47] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[46] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[45] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[44] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[43] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[42] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[41] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[40] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[39] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[38] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[24], w[25], offset); + w[62] = amd_bytealign_S (w[23], w[24], offset); + w[61] = amd_bytealign_S (w[22], w[23], offset); + w[60] = amd_bytealign_S (w[21], w[22], offset); + w[59] = amd_bytealign_S (w[20], w[21], offset); + w[58] = amd_bytealign_S (w[19], w[20], offset); + w[57] = amd_bytealign_S (w[18], w[19], offset); + w[56] = amd_bytealign_S (w[17], w[18], offset); + w[55] = amd_bytealign_S (w[16], w[17], offset); + w[54] = amd_bytealign_S (w[15], w[16], offset); + w[53] = amd_bytealign_S (w[14], w[15], offset); + w[52] = amd_bytealign_S (w[13], w[14], offset); + w[51] = amd_bytealign_S (w[12], w[13], offset); + w[50] = amd_bytealign_S (w[11], w[12], offset); + w[49] = amd_bytealign_S (w[10], w[11], offset); + w[48] = amd_bytealign_S (w[ 9], w[10], offset); + w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[39] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[38] = amd_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -30126,64 +46767,34 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 39: - w[63] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[62] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[61] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[60] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[59] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[58] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[57] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[56] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[55] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[54] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[53] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[52] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[51] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[50] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[49] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[48] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[47] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[46] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[45] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[44] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[43] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[42] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[41] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[40] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[39] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[23], w[24], offset); + w[62] = amd_bytealign_S (w[22], w[23], offset); + w[61] = amd_bytealign_S (w[21], w[22], offset); + w[60] = amd_bytealign_S (w[20], w[21], offset); + w[59] = amd_bytealign_S (w[19], w[20], offset); + w[58] = amd_bytealign_S (w[18], w[19], offset); + w[57] = amd_bytealign_S (w[17], w[18], offset); + w[56] = amd_bytealign_S (w[16], w[17], offset); + w[55] = amd_bytealign_S (w[15], w[16], offset); + w[54] = amd_bytealign_S (w[14], w[15], offset); + w[53] = amd_bytealign_S (w[13], w[14], offset); + w[52] = amd_bytealign_S (w[12], w[13], offset); + w[51] = amd_bytealign_S (w[11], w[12], offset); + w[50] = amd_bytealign_S (w[10], w[11], offset); + w[49] = amd_bytealign_S (w[ 9], w[10], offset); + w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[39] = amd_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -30224,62 +46835,33 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 40: - w[63] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[62] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[61] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[60] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[59] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[58] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[57] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[56] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[55] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[54] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[53] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[52] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[51] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[50] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[49] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[48] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[47] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[46] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[45] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[44] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[43] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[42] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[41] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[40] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[22], w[23], offset); + w[62] = amd_bytealign_S (w[21], w[22], offset); + w[61] = amd_bytealign_S (w[20], w[21], offset); + w[60] = amd_bytealign_S (w[19], w[20], offset); + w[59] = amd_bytealign_S (w[18], w[19], offset); + w[58] = amd_bytealign_S (w[17], w[18], offset); + w[57] = amd_bytealign_S (w[16], w[17], offset); + w[56] = amd_bytealign_S (w[15], w[16], offset); + w[55] = amd_bytealign_S (w[14], w[15], offset); + w[54] = amd_bytealign_S (w[13], w[14], offset); + w[53] = amd_bytealign_S (w[12], w[13], offset); + w[52] = amd_bytealign_S (w[11], w[12], offset); + w[51] = amd_bytealign_S (w[10], w[11], offset); + w[50] = amd_bytealign_S (w[ 9], w[10], offset); + w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[40] = amd_bytealign_S ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -30321,60 +46903,32 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 41: - w[63] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[62] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[61] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[60] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[59] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[58] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[57] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[56] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[55] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[54] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[53] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[52] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[51] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[50] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[49] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[48] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[47] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[46] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[45] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[44] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[43] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[42] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[41] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[21], w[22], offset); + w[62] = amd_bytealign_S (w[20], w[21], offset); + w[61] = amd_bytealign_S (w[19], w[20], offset); + w[60] = amd_bytealign_S (w[18], w[19], offset); + w[59] = amd_bytealign_S (w[17], w[18], offset); + w[58] = amd_bytealign_S (w[16], w[17], offset); + w[57] = amd_bytealign_S (w[15], w[16], offset); + w[56] = amd_bytealign_S (w[14], w[15], offset); + w[55] = amd_bytealign_S (w[13], w[14], offset); + w[54] = amd_bytealign_S (w[12], w[13], offset); + w[53] = amd_bytealign_S (w[11], w[12], offset); + w[52] = amd_bytealign_S (w[10], w[11], offset); + w[51] = amd_bytealign_S (w[ 9], w[10], offset); + w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[41] = amd_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -30417,58 +46971,31 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 42: - w[63] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[62] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[61] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[60] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[59] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[58] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[57] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[56] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[55] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[54] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[53] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[52] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[51] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[50] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[49] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[48] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[47] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[46] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[45] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[44] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[43] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[42] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[20], w[21], offset); + w[62] = amd_bytealign_S (w[19], w[20], offset); + w[61] = amd_bytealign_S (w[18], w[19], offset); + w[60] = amd_bytealign_S (w[17], w[18], offset); + w[59] = amd_bytealign_S (w[16], w[17], offset); + w[58] = amd_bytealign_S (w[15], w[16], offset); + w[57] = amd_bytealign_S (w[14], w[15], offset); + w[56] = amd_bytealign_S (w[13], w[14], offset); + w[55] = amd_bytealign_S (w[12], w[13], offset); + w[54] = amd_bytealign_S (w[11], w[12], offset); + w[53] = amd_bytealign_S (w[10], w[11], offset); + w[52] = amd_bytealign_S (w[ 9], w[10], offset); + w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[42] = amd_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -30512,56 +47039,30 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 43: - w[63] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[62] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[61] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[60] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[59] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[58] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[57] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[56] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[55] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[54] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[53] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[52] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[51] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[50] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[49] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[48] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[47] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[46] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[45] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[44] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[43] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[19], w[20], offset); + w[62] = amd_bytealign_S (w[18], w[19], offset); + w[61] = amd_bytealign_S (w[17], w[18], offset); + w[60] = amd_bytealign_S (w[16], w[17], offset); + w[59] = amd_bytealign_S (w[15], w[16], offset); + w[58] = amd_bytealign_S (w[14], w[15], offset); + w[57] = amd_bytealign_S (w[13], w[14], offset); + w[56] = amd_bytealign_S (w[12], w[13], offset); + w[55] = amd_bytealign_S (w[11], w[12], offset); + w[54] = amd_bytealign_S (w[10], w[11], offset); + w[53] = amd_bytealign_S (w[ 9], w[10], offset); + w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[43] = amd_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -30606,54 +47107,29 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 44: - w[63] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[62] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[61] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[60] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[59] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[58] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[57] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[56] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[55] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[54] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[53] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[52] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[51] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[50] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[49] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[48] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[47] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[46] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[45] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[44] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[18], w[19], offset); + w[62] = amd_bytealign_S (w[17], w[18], offset); + w[61] = amd_bytealign_S (w[16], w[17], offset); + w[60] = amd_bytealign_S (w[15], w[16], offset); + w[59] = amd_bytealign_S (w[14], w[15], offset); + w[58] = amd_bytealign_S (w[13], w[14], offset); + w[57] = amd_bytealign_S (w[12], w[13], offset); + w[56] = amd_bytealign_S (w[11], w[12], offset); + w[55] = amd_bytealign_S (w[10], w[11], offset); + w[54] = amd_bytealign_S (w[ 9], w[10], offset); + w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[44] = amd_bytealign_S ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -30699,52 +47175,28 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 45: - w[63] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[62] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[61] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[60] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[59] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[58] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[57] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[56] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[55] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[54] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[53] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[52] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[51] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[50] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[49] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[48] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[47] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[46] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[45] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[17], w[18], offset); + w[62] = amd_bytealign_S (w[16], w[17], offset); + w[61] = amd_bytealign_S (w[15], w[16], offset); + w[60] = amd_bytealign_S (w[14], w[15], offset); + w[59] = amd_bytealign_S (w[13], w[14], offset); + w[58] = amd_bytealign_S (w[12], w[13], offset); + w[57] = amd_bytealign_S (w[11], w[12], offset); + w[56] = amd_bytealign_S (w[10], w[11], offset); + w[55] = amd_bytealign_S (w[ 9], w[10], offset); + w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[45] = amd_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -30791,50 +47243,27 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 46: - w[63] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[62] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[61] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[60] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[59] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[58] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[57] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[56] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[55] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[54] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[53] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[52] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[51] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[50] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[49] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[48] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[47] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[46] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[16], w[17], offset); + w[62] = amd_bytealign_S (w[15], w[16], offset); + w[61] = amd_bytealign_S (w[14], w[15], offset); + w[60] = amd_bytealign_S (w[13], w[14], offset); + w[59] = amd_bytealign_S (w[12], w[13], offset); + w[58] = amd_bytealign_S (w[11], w[12], offset); + w[57] = amd_bytealign_S (w[10], w[11], offset); + w[56] = amd_bytealign_S (w[ 9], w[10], offset); + w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[46] = amd_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -30882,48 +47311,26 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 47: - w[63] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[62] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[61] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[60] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[59] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[58] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[57] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[56] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[55] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[54] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[53] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[52] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[51] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[50] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[49] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[48] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[47] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[15], w[16], offset); + w[62] = amd_bytealign_S (w[14], w[15], offset); + w[61] = amd_bytealign_S (w[13], w[14], offset); + w[60] = amd_bytealign_S (w[12], w[13], offset); + w[59] = amd_bytealign_S (w[11], w[12], offset); + w[58] = amd_bytealign_S (w[10], w[11], offset); + w[57] = amd_bytealign_S (w[ 9], w[10], offset); + w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[47] = amd_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -30972,46 +47379,25 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 48: - w[63] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[62] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[61] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[60] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[59] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[58] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[57] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[56] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[55] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[54] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[53] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[52] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[51] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[50] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[49] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[48] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[14], w[15], offset); + w[62] = amd_bytealign_S (w[13], w[14], offset); + w[61] = amd_bytealign_S (w[12], w[13], offset); + w[60] = amd_bytealign_S (w[11], w[12], offset); + w[59] = amd_bytealign_S (w[10], w[11], offset); + w[58] = amd_bytealign_S (w[ 9], w[10], offset); + w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[48] = amd_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -31061,44 +47447,24 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 49: - w[63] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[62] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[61] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[60] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[59] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[58] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[57] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[56] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[55] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[54] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[53] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[52] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[51] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[50] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[49] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[13], w[14], offset); + w[62] = amd_bytealign_S (w[12], w[13], offset); + w[61] = amd_bytealign_S (w[11], w[12], offset); + w[60] = amd_bytealign_S (w[10], w[11], offset); + w[59] = amd_bytealign_S (w[ 9], w[10], offset); + w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[49] = amd_bytealign_S ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -31149,42 +47515,23 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 50: - w[63] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[62] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[61] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[60] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[59] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[58] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[57] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[56] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[55] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[54] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[53] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[52] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[51] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[50] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[12], w[13], offset); + w[62] = amd_bytealign_S (w[11], w[12], offset); + w[61] = amd_bytealign_S (w[10], w[11], offset); + w[60] = amd_bytealign_S (w[ 9], w[10], offset); + w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[50] = amd_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; @@ -31236,40 +47583,22 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 51: - w[63] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[62] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[61] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[60] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[59] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[58] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[57] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[56] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[55] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[54] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[53] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[52] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[51] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[11], w[12], offset); + w[62] = amd_bytealign_S (w[10], w[11], offset); + w[61] = amd_bytealign_S (w[ 9], w[10], offset); + w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[51] = amd_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -31322,38 +47651,21 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 52: - w[63] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[62] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[61] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[60] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[59] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[58] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[57] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[56] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[55] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[54] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[53] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[52] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[10], w[11], offset); + w[62] = amd_bytealign_S (w[ 9], w[10], offset); + w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[52] = amd_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -31407,36 +47719,20 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 53: - w[63] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[62] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[61] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[60] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[59] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[58] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[57] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[56] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[55] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[54] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[53] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 9], w[10], offset); + w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[53] = amd_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -31491,34 +47787,19 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 54: - w[63] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[62] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[61] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[60] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[59] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[58] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[57] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[56] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[55] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[54] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[54] = amd_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -31574,32 +47855,18 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 55: - w[63] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[62] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[61] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[60] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[59] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[58] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[57] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[56] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[55] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[55] = amd_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -31656,30 +47923,17 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 56: - w[63] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[62] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[61] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[60] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[59] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[58] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[57] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[56] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[56] = amd_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -31737,28 +47991,16 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 57: - w[63] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[62] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[61] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[60] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[59] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[58] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[57] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[57] = amd_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -31817,26 +48059,15 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 58: - w[63] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[62] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[61] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[60] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[59] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[58] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[58] = amd_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -31896,24 +48127,14 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 59: - w[63] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[62] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[61] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[60] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[59] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[59] = amd_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -31974,22 +48195,13 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 60: - w[63] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[62] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[61] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[60] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[60] = amd_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -32051,20 +48263,12 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 61: - w[63] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[62] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[61] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[61] = amd_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -32127,18 +48331,11 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 62: - w[63] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[62] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[62] = amd_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -32202,16 +48399,10 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[62] = w[63]; - w[63] = 0; - } - break; case 63: - w[63] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -32276,13 +48467,12 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[63] = 0; - } - break; } + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); + #endif #ifdef IS_NV diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index 170ec5385..74ca84cc2 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -760,7 +760,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD || defined IS_GENERIC u32 s0 = 0; u32 s1 = 0; u32 s2 = 0; @@ -769,64 +768,69 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 u32 s5 = 0; u32 s6 = 0; u32 s7 = 0; - u32 s8 = 0; + + #if defined IS_AMD || defined IS_GENERIC + const u32 src_r00 = swap32_S (src_r0[0]); + const u32 src_r01 = swap32_S (src_r0[1]); + const u32 src_r02 = swap32_S (src_r0[2]); + const u32 src_r03 = swap32_S (src_r0[3]); + const u32 src_r10 = swap32_S (src_r1[0]); + const u32 src_r11 = swap32_S (src_r1[1]); + const u32 src_r12 = swap32_S (src_r1[2]); + const u32 src_r13 = swap32_S (src_r1[3]); switch (offset / 4) { case 0: - s8 = amd_bytealign_S ( 0, src_r1[3], offset_minus_4); - s7 = amd_bytealign_S (src_r1[3], src_r1[2], offset_minus_4); - s6 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4); - s5 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s4 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s3 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s2 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s1 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s0 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r12, src_r13, offset); + s6 = amd_bytealign_S (src_r11, src_r12, offset); + s5 = amd_bytealign_S (src_r10, src_r11, offset); + s4 = amd_bytealign_S (src_r03, src_r10, offset); + s3 = amd_bytealign_S (src_r02, src_r03, offset); + s2 = amd_bytealign_S (src_r01, src_r02, offset); + s1 = amd_bytealign_S (src_r00, src_r01, offset); + s0 = amd_bytealign_S ( 0, src_r00, offset); break; case 1: - s8 = amd_bytealign_S ( 0, src_r1[2], offset_minus_4); - s7 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4); - s6 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s5 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s4 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s3 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s2 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s1 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r11, src_r12, offset); + s6 = amd_bytealign_S (src_r10, src_r11, offset); + s5 = amd_bytealign_S (src_r03, src_r10, offset); + s4 = amd_bytealign_S (src_r02, src_r03, offset); + s3 = amd_bytealign_S (src_r01, src_r02, offset); + s2 = amd_bytealign_S (src_r00, src_r01, offset); + s1 = amd_bytealign_S ( 0, src_r00, offset); s0 = 0; break; case 2: - s8 = amd_bytealign_S ( 0, src_r1[1], offset_minus_4); - s7 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s6 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s5 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s4 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s3 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s2 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r10, src_r11, offset); + s6 = amd_bytealign_S (src_r03, src_r10, offset); + s5 = amd_bytealign_S (src_r02, src_r03, offset); + s4 = amd_bytealign_S (src_r01, src_r02, offset); + s3 = amd_bytealign_S (src_r00, src_r01, offset); + s2 = amd_bytealign_S ( 0, src_r00, offset); s1 = 0; s0 = 0; break; case 3: - s8 = amd_bytealign_S ( 0, src_r1[0], offset_minus_4); - s7 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s6 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s5 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s4 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s3 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r03, src_r10, offset); + s6 = amd_bytealign_S (src_r02, src_r03, offset); + s5 = amd_bytealign_S (src_r01, src_r02, offset); + s4 = amd_bytealign_S (src_r00, src_r01, offset); + s3 = amd_bytealign_S ( 0, src_r00, offset); s2 = 0; s1 = 0; s0 = 0; + break; case 4: - s8 = amd_bytealign_S ( 0, src_r0[3], offset_minus_4); - s7 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s6 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s5 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s4 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r02, src_r03, offset); + s6 = amd_bytealign_S (src_r01, src_r02, offset); + s5 = amd_bytealign_S (src_r00, src_r01, offset); + s4 = amd_bytealign_S ( 0, src_r00, offset); s3 = 0; s2 = 0; s1 = 0; @@ -834,10 +838,9 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 5: - s8 = amd_bytealign_S ( 0, src_r0[2], offset_minus_4); - s7 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s6 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s5 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r01, src_r02, offset); + s6 = amd_bytealign_S (src_r00, src_r01, offset); + s5 = amd_bytealign_S ( 0, src_r00, offset); s4 = 0; s3 = 0; s2 = 0; @@ -846,9 +849,8 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 6: - s8 = amd_bytealign_S ( 0, src_r0[1], offset_minus_4); - s7 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s6 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r00, src_r01, offset); + s6 = amd_bytealign_S ( 0, src_r00, offset); s5 = 0; s4 = 0; s3 = 0; @@ -858,8 +860,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 7: - s8 = amd_bytealign_S ( 0, src_r0[0], offset_minus_4); - s7 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S ( 0, src_r00, offset); s6 = 0; s5 = 0; s4 = 0; @@ -870,83 +871,69 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; } - if (offset_mod_4 == 0) - { - buf0[0] = src_l0[0] | s1; - buf0[1] = src_l0[1] | s2; - buf0[2] = src_l0[2] | s3; - buf0[3] = src_l0[3] | s4; - buf1[0] = src_l1[0] | s5; - buf1[1] = src_l1[1] | s6; - buf1[2] = src_l1[2] | s7; - buf1[3] = src_l1[3] | s8; - } - else - { - buf0[0] = src_l0[0] | s0; - buf0[1] = src_l0[1] | s1; - buf0[2] = src_l0[2] | s2; - buf0[3] = src_l0[3] | s3; - buf1[0] = src_l1[0] | s4; - buf1[1] = src_l1[1] | s5; - buf1[2] = src_l1[2] | s6; - buf1[3] = src_l1[3] | s7; - } + s0 = swap32_S (s0); + s1 = swap32_S (s1); + s2 = swap32_S (s2); + s3 = swap32_S (s3); + s4 = swap32_S (s4); + s5 = swap32_S (s5); + s6 = swap32_S (s6); + s7 = swap32_S (s7); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - u32 s0 = 0; - u32 s1 = 0; - u32 s2 = 0; - u32 s3 = 0; - u32 s4 = 0; - u32 s5 = 0; - u32 s6 = 0; - u32 s7 = 0; + const u32 src_r00 = src_r0[0]; + const u32 src_r01 = src_r0[1]; + const u32 src_r02 = src_r0[2]; + const u32 src_r03 = src_r0[3]; + const u32 src_r10 = src_r1[0]; + const u32 src_r11 = src_r1[1]; + const u32 src_r12 = src_r1[2]; + const u32 src_r13 = src_r1[3]; switch (offset / 4) { case 0: - s7 = __byte_perm_S (src_r1[2], src_r1[3], selector); - s6 = __byte_perm_S (src_r1[1], src_r1[2], selector); - s5 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s4 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s3 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s2 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s1 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s0 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r12, src_r13, selector); + s6 = __byte_perm_S (src_r11, src_r12, selector); + s5 = __byte_perm_S (src_r10, src_r11, selector); + s4 = __byte_perm_S (src_r03, src_r10, selector); + s3 = __byte_perm_S (src_r02, src_r03, selector); + s2 = __byte_perm_S (src_r01, src_r02, selector); + s1 = __byte_perm_S (src_r00, src_r01, selector); + s0 = __byte_perm_S ( 0, src_r00, selector); break; case 1: - s7 = __byte_perm_S (src_r1[1], src_r1[2], selector); - s6 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s5 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s4 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s3 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s2 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s1 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r11, src_r12, selector); + s6 = __byte_perm_S (src_r10, src_r11, selector); + s5 = __byte_perm_S (src_r03, src_r10, selector); + s4 = __byte_perm_S (src_r02, src_r03, selector); + s3 = __byte_perm_S (src_r01, src_r02, selector); + s2 = __byte_perm_S (src_r00, src_r01, selector); + s1 = __byte_perm_S ( 0, src_r00, selector); s0 = 0; break; case 2: - s7 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s6 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s5 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s4 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s3 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s2 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r10, src_r11, selector); + s6 = __byte_perm_S (src_r03, src_r10, selector); + s5 = __byte_perm_S (src_r02, src_r03, selector); + s4 = __byte_perm_S (src_r01, src_r02, selector); + s3 = __byte_perm_S (src_r00, src_r01, selector); + s2 = __byte_perm_S ( 0, src_r00, selector); s1 = 0; s0 = 0; break; case 3: - s7 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s6 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s5 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s4 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s3 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r03, src_r10, selector); + s6 = __byte_perm_S (src_r02, src_r03, selector); + s5 = __byte_perm_S (src_r01, src_r02, selector); + s4 = __byte_perm_S (src_r00, src_r01, selector); + s3 = __byte_perm_S ( 0, src_r00, selector); s2 = 0; s1 = 0; s0 = 0; @@ -954,10 +941,10 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 4: - s7 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s6 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s5 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s4 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r02, src_r03, selector); + s6 = __byte_perm_S (src_r01, src_r02, selector); + s5 = __byte_perm_S (src_r00, src_r01, selector); + s4 = __byte_perm_S ( 0, src_r00, selector); s3 = 0; s2 = 0; s1 = 0; @@ -965,9 +952,9 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 5: - s7 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s6 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s5 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r01, src_r02, selector); + s6 = __byte_perm_S (src_r00, src_r01, selector); + s5 = __byte_perm_S ( 0, src_r00, selector); s4 = 0; s3 = 0; s2 = 0; @@ -976,8 +963,8 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 6: - s7 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s6 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r00, src_r01, selector); + s6 = __byte_perm_S ( 0, src_r00, selector); s5 = 0; s4 = 0; s3 = 0; @@ -987,7 +974,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 7: - s7 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S ( 0, src_r00, selector); s6 = 0; s5 = 0; s4 = 0; @@ -997,6 +984,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 s0 = 0; break; } + #endif buf0[0] = src_l0[0] | s0; buf0[1] = src_l0[1] | s1; @@ -1006,8 +994,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 buf1[1] = src_l1[1] | s5; buf1[2] = src_l1[2] | s6; buf1[3] = src_l1[3] | s7; - - #endif } void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index cb89b0b7b..5ccb727aa 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -10,110 +10,13 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" #define md5crypt_magic 0x00243124u -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; @@ -127,30 +30,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -233,30 +143,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -337,24 +256,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index d624b4678..af6203263 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -8,6 +8,7 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" @@ -15,104 +16,6 @@ #define md5apr1_magic0 0x72706124u #define md5apr1_magic1 0x00002431u -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; @@ -126,30 +29,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -232,30 +142,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -336,24 +255,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index 48fa8911f..f46abd13c 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -62,7 +62,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 pw_lenv = ceil ((float) pw_len / 4); - u32 w[64] = { 0 }; + u32x w[64] = { 0 }; for (int idx = 0; idx < pw_lenv; idx++) { @@ -88,13 +88,13 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule * loop */ - u32 w0l = w[0]; + u32x w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0lr = w0l | w0r; + const u32x w0lr = w0l | w0r; w[0] = w0lr; @@ -106,10 +106,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_final_vector (&ctx0); - const u32 a = ctx0.h[0]; - const u32 b = ctx0.h[1]; - const u32 c = ctx0.h[2]; - const u32 d = ctx0.h[3]; + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; md5_ctx_vector_t ctx; @@ -161,10 +161,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h); - const u32 r0 = ctx.h[DGST_R0]; - const u32 r1 = ctx.h[DGST_R1]; - const u32 r2 = ctx.h[DGST_R2]; - const u32 r3 = ctx.h[DGST_R3]; + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; COMPARE_M_SIMD (r0, r1, r2, r3); } @@ -219,7 +219,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 pw_lenv = ceil ((float) pw_len / 4); - u32 w[64] = { 0 }; + u32x w[64] = { 0 }; for (int idx = 0; idx < pw_lenv; idx++) { @@ -245,13 +245,13 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule * loop */ - u32 w0l = w[0]; + u32x w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0lr = w0l | w0r; + const u32x w0lr = w0l | w0r; w[0] = w0lr; @@ -263,10 +263,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_final_vector (&ctx0); - const u32 a = ctx0.h[0]; - const u32 b = ctx0.h[1]; - const u32 c = ctx0.h[2]; - const u32 d = ctx0.h[3]; + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; md5_ctx_vector_t ctx; @@ -318,10 +318,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h); - const u32 r0 = ctx.h[DGST_R0]; - const u32 r1 = ctx.h[DGST_R1]; - const u32 r2 = ctx.h[DGST_R2]; - const u32 r3 = ctx.h[DGST_R3]; + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; COMPARE_S_SIMD (r0, r1, r2, r3); } diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index 774b97327..760557daa 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -8,6 +8,7 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_sha1.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" @@ -2116,33 +2117,42 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); - tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = swap32_S (append[4]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp5 = amd_bytealign (in4, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); + tmp5 = swap32_S (tmp5); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], append[4], selector); - tmp5 = __byte_perm (append[4], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); + tmp5 = __byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; @@ -2187,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 } } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index ec03f6218..a447bf79e 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -2117,33 +2117,42 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); - tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = swap32_S (append[4]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp5 = amd_bytealign (in4, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); + tmp5 = swap32_S (tmp5); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], append[4], selector); - tmp5 = __byte_perm (append[4], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp3 = __byte_perm (in3, in4, selector); + tmp4 = __byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; @@ -2188,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 } } -void orig_sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { const u32 gid = get_global_id (0); diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index 6b3980270..a0c85c49c 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -8,108 +8,11 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; @@ -123,30 +26,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -229,30 +139,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -333,24 +252,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index 3483c57ec..a52655b1b 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -32,8 +32,6 @@ __constant u32a k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -#if 1 - void sha256_transform (const u32 w[16], u32 digest[8]) { u32 a = digest[0]; @@ -203,30 +201,37 @@ u32 memcat16 (u32 block[16], const u32 offset, const u32 append[4], const u32 ap const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif switch (offset / 4) @@ -337,30 +342,37 @@ u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 a const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif u32 carry[4] = { 0, 0, 0, 0 }; @@ -484,7 +496,7 @@ u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 a return new_len; } -u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) +u32 memcat20 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len) { u32 tmp0; u32 tmp1; @@ -497,30 +509,37 @@ u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 ap const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif switch (offset / 4) @@ -626,7 +645,7 @@ u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 ap return offset + append_len; } -u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) +u32 memcat20_x80 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len) { u32 tmp0; u32 tmp1; @@ -639,30 +658,39 @@ u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif switch (offset / 4) @@ -1201,543 +1229,3 @@ __kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rul #include COMPARE_M } - -#else - -// this is basically a much cleaner version, but apparently drops speeds by over 100% :( - -#define PUTCHAR32_BE(a,p,c) ((u8 *)(a))[(p) ^ 3] = (u8) (c) -#define GETCHAR32_BE(a,p) ((u8 *)(a))[(p) ^ 3] - -typedef struct -{ - u32 state[8]; - u32 buf[32]; - int len; - -} sha256_ctx_t; - -void sha256_transform (const u32 w[16], u32 digest[8]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; - - u32 w0_t = w[ 0]; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; - - #define ROUND_EXPAND() \ - { \ - w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ - w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ - w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ - w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ - w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ - w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ - w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ - w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ - w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ - w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ - wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ - wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ - wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ - wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ - we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ - wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ - } - - #define ROUND_STEP(i) \ - { \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ - } - - ROUND_STEP (0); - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 16; i < 64; i += 16) - { - ROUND_EXPAND (); ROUND_STEP (i); - } - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; - digest[4] += e; - digest[5] += f; - digest[6] += g; - digest[7] += h; -} - -void sha256_init (sha256_ctx_t *sha256_ctx) -{ - sha256_ctx->state[0] = SHA256M_A; - sha256_ctx->state[1] = SHA256M_B; - sha256_ctx->state[2] = SHA256M_C; - sha256_ctx->state[3] = SHA256M_D; - sha256_ctx->state[4] = SHA256M_E; - sha256_ctx->state[5] = SHA256M_F; - sha256_ctx->state[6] = SHA256M_G; - sha256_ctx->state[7] = SHA256M_H; - - sha256_ctx->len = 0; -} - -void sha256_update (sha256_ctx_t *sha256_ctx, const u32 *buf, int len) -{ - int pos = sha256_ctx->len & 0x3f; - - sha256_ctx->len += len; - - if ((pos + len) < 64) - { - for (int i = 0; i < len; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i)); - } - - return; - } - - int cnt = 64 - pos; - - for (int i = 0; i < cnt; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i)); - } - - sha256_transform (sha256_ctx->buf, sha256_ctx->state); - - len -= cnt; - - for (int i = 0; i < len; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, i, GETCHAR32_BE (buf, cnt + i)); - } -} - -void sha256_final (sha256_ctx_t *sha256_ctx) -{ - int pos = sha256_ctx->len & 0x3f; - - for (int i = pos; i < 64; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, i, 0); - } - - PUTCHAR32_BE (sha256_ctx->buf, pos, 0x80); - - if (pos >= 56) - { - sha256_transform (sha256_ctx->buf, sha256_ctx->state); - - sha256_ctx->buf[ 0] = 0; - sha256_ctx->buf[ 1] = 0; - sha256_ctx->buf[ 2] = 0; - sha256_ctx->buf[ 3] = 0; - sha256_ctx->buf[ 4] = 0; - sha256_ctx->buf[ 5] = 0; - sha256_ctx->buf[ 6] = 0; - sha256_ctx->buf[ 7] = 0; - sha256_ctx->buf[ 8] = 0; - sha256_ctx->buf[ 9] = 0; - sha256_ctx->buf[10] = 0; - sha256_ctx->buf[11] = 0; - sha256_ctx->buf[12] = 0; - sha256_ctx->buf[13] = 0; - sha256_ctx->buf[14] = 0; - sha256_ctx->buf[15] = 0; - } - - sha256_ctx->buf[15] = sha256_ctx->len * 8; - - sha256_transform (sha256_ctx->buf, sha256_ctx->state); -} - -__kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - u32 pw[4]; - - pw[0] = swap32 (pws[gid].i[0]); - pw[1] = swap32 (pws[gid].i[1]); - pw[2] = swap32 (pws[gid].i[2]); - pw[3] = swap32 (pws[gid].i[3]); - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - u32 salt[4]; - - salt[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); - salt[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); - salt[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); - salt[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); - - u32 salt_len = salt_bufs[salt_pos].salt_len; - - /** - * begin - */ - - sha256_ctx_t sha256_ctx; - - sha256_init (&sha256_ctx); - - sha256_update (&sha256_ctx, pw, pw_len); - sha256_update (&sha256_ctx, salt, salt_len); - sha256_update (&sha256_ctx, pw, pw_len); - - sha256_final (&sha256_ctx); - - u32 tmp[8]; - - tmp[0] = sha256_ctx.state[0]; - tmp[1] = sha256_ctx.state[1]; - tmp[2] = sha256_ctx.state[2]; - tmp[3] = sha256_ctx.state[3]; - tmp[4] = sha256_ctx.state[4]; - tmp[5] = sha256_ctx.state[5]; - tmp[6] = sha256_ctx.state[6]; - tmp[7] = sha256_ctx.state[7]; - - sha256_init (&sha256_ctx); - - sha256_update (&sha256_ctx, pw, pw_len); - sha256_update (&sha256_ctx, salt, salt_len); - sha256_update (&sha256_ctx, tmp, pw_len); - - for (u32 j = pw_len; j; j >>= 1) - { - if (j & 1) - { - sha256_update (&sha256_ctx, tmp, 32); - } - else - { - sha256_update (&sha256_ctx, pw, pw_len); - } - } - - sha256_final (&sha256_ctx); - - tmps[gid].alt_result[0] = sha256_ctx.state[0]; - tmps[gid].alt_result[1] = sha256_ctx.state[1]; - tmps[gid].alt_result[2] = sha256_ctx.state[2]; - tmps[gid].alt_result[3] = sha256_ctx.state[3]; - tmps[gid].alt_result[4] = sha256_ctx.state[4]; - tmps[gid].alt_result[5] = sha256_ctx.state[5]; - tmps[gid].alt_result[6] = sha256_ctx.state[6]; - tmps[gid].alt_result[7] = sha256_ctx.state[7]; - - // p_bytes - - sha256_init (&sha256_ctx); - - for (u32 j = 0; j < pw_len; j++) - { - sha256_update (&sha256_ctx, pw, pw_len); - } - - sha256_final (&sha256_ctx); - - tmps[gid].p_bytes[0] = sha256_ctx.state[0]; - tmps[gid].p_bytes[1] = sha256_ctx.state[1]; - tmps[gid].p_bytes[2] = sha256_ctx.state[2]; - tmps[gid].p_bytes[3] = sha256_ctx.state[3]; - - // s_bytes - - sha256_init (&sha256_ctx); - - for (u32 j = 0; j < 16 + ((tmps[gid].alt_result[0] >> 24) & 0xff); j++) - { - sha256_update (&sha256_ctx, salt, salt_len); - } - - sha256_final (&sha256_ctx); - - tmps[gid].s_bytes[0] = sha256_ctx.state[0]; - tmps[gid].s_bytes[1] = sha256_ctx.state[1]; - tmps[gid].s_bytes[2] = sha256_ctx.state[2]; - tmps[gid].s_bytes[3] = sha256_ctx.state[3]; -} - -__kernel void m07400_loop (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - u32 p_bytes0[4]; - - p_bytes0[0] = tmps[gid].p_bytes[0]; - p_bytes0[1] = tmps[gid].p_bytes[1]; - p_bytes0[2] = tmps[gid].p_bytes[2]; - p_bytes0[3] = tmps[gid].p_bytes[3]; - - const u32 pw_len = pws[gid].pw_len; - - u32 s_bytes0[4]; - - s_bytes0[0] = tmps[gid].s_bytes[0]; - s_bytes0[1] = tmps[gid].s_bytes[1]; - s_bytes0[2] = tmps[gid].s_bytes[2]; - s_bytes0[3] = tmps[gid].s_bytes[3]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; - - u32 wpc_len[8]; - - wpc_len[0] = 32 + 0 + 0 + pw_len; - wpc_len[1] = pw_len + 0 + 0 + 32; - wpc_len[2] = 32 + salt_len + 0 + pw_len; - wpc_len[3] = pw_len + salt_len + 0 + 32; - wpc_len[4] = 32 + 0 + pw_len + pw_len; - wpc_len[5] = pw_len + 0 + pw_len + 32; - wpc_len[6] = 32 + salt_len + pw_len + pw_len; - wpc_len[7] = pw_len + salt_len + pw_len + 32; - - u32 wpc[8][32] = { { 0 } }; - - for (u32 i = 0; i < 8; i++) - { - u32 block_len = 0; - - if (i & 1) - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - else - { - block_len += 32; - } - - if (i & 2) - { - for (u32 j = 0; j < salt_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (s_bytes0, j)); - } - } - - if (i & 4) - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - - if (i & 1) - { - block_len += 32; - } - else - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - - PUTCHAR32_BE (wpc[i], block_len, 0x80); - - if (block_len < 56) - { - wpc[i][15] = block_len * 8; - } - else - { - wpc[i][31] = block_len * 8; - } - } - - /** - * base - */ - - u32 alt_result[8]; - - alt_result[0] = tmps[gid].alt_result[0]; - alt_result[1] = tmps[gid].alt_result[1]; - alt_result[2] = tmps[gid].alt_result[2]; - alt_result[3] = tmps[gid].alt_result[3]; - alt_result[4] = tmps[gid].alt_result[4]; - alt_result[5] = tmps[gid].alt_result[5]; - alt_result[6] = tmps[gid].alt_result[6]; - alt_result[7] = tmps[gid].alt_result[7]; - - /* Repeatedly run the collected hash value through SHA256 to burn - CPU cycles. */ - - for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) - { - const u32 j1 = (j & 1) ? 1 : 0; - const u32 j3 = (j % 3) ? 2 : 0; - const u32 j7 = (j % 7) ? 4 : 0; - - const u32 pc = j1 + j3 + j7; - - u32 block[32]; - - block[ 0] = wpc[pc][ 0]; - block[ 1] = wpc[pc][ 1]; - block[ 2] = wpc[pc][ 2]; - block[ 3] = wpc[pc][ 3]; - block[ 4] = wpc[pc][ 4]; - block[ 5] = wpc[pc][ 5]; - block[ 6] = wpc[pc][ 6]; - block[ 7] = wpc[pc][ 7]; - block[ 8] = wpc[pc][ 8]; - block[ 9] = wpc[pc][ 9]; - block[10] = wpc[pc][10]; - block[11] = wpc[pc][11]; - block[12] = wpc[pc][12]; - block[13] = wpc[pc][13]; - block[14] = wpc[pc][14]; - block[15] = wpc[pc][15]; - block[16] = wpc[pc][16]; - block[17] = wpc[pc][17]; - block[18] = wpc[pc][18]; - block[19] = wpc[pc][19]; - block[20] = wpc[pc][20]; - block[21] = wpc[pc][21]; - block[22] = wpc[pc][22]; - block[23] = wpc[pc][23]; - block[24] = wpc[pc][24]; - block[25] = wpc[pc][25]; - block[26] = wpc[pc][26]; - block[27] = wpc[pc][27]; - block[28] = wpc[pc][28]; - block[29] = wpc[pc][29]; - block[30] = wpc[pc][30]; - block[31] = wpc[pc][31]; - - const u32 block_len = wpc_len[pc]; - - if (j1) - { - #ifdef _unroll - #pragma unroll - #endif - for (u32 k = 0, p = block_len - 32; k < 32; k++, p++) - { - PUTCHAR32_BE (block, p, GETCHAR32_BE (alt_result, k)); - } - } - else - { - block[0] = alt_result[0]; - block[1] = alt_result[1]; - block[2] = alt_result[2]; - block[3] = alt_result[3]; - block[4] = alt_result[4]; - block[5] = alt_result[5]; - block[6] = alt_result[6]; - block[7] = alt_result[7]; - } - - alt_result[0] = SHA256M_A; - alt_result[1] = SHA256M_B; - alt_result[2] = SHA256M_C; - alt_result[3] = SHA256M_D; - alt_result[4] = SHA256M_E; - alt_result[5] = SHA256M_F; - alt_result[6] = SHA256M_G; - alt_result[7] = SHA256M_H; - - sha256_transform (block, alt_result); - - if (block_len >= 56) - { - sha256_transform (block + 16, alt_result); - } - } - - tmps[gid].alt_result[0] = alt_result[0]; - tmps[gid].alt_result[1] = alt_result[1]; - tmps[gid].alt_result[2] = alt_result[2]; - tmps[gid].alt_result[3] = alt_result[3]; - tmps[gid].alt_result[4] = alt_result[4]; - tmps[gid].alt_result[5] = alt_result[5]; - tmps[gid].alt_result[6] = alt_result[6]; - tmps[gid].alt_result[7] = alt_result[7]; -} - -__kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - const u32 lid = get_local_id (0); - - const u32 r0 = swap32 (tmps[gid].alt_result[0]); - const u32 r1 = swap32 (tmps[gid].alt_result[1]); - const u32 r2 = swap32 (tmps[gid].alt_result[2]); - const u32 r3 = swap32 (tmps[gid].alt_result[3]); - - #define il_pos 0 - - #include COMPARE_M -} - -#endif diff --git a/OpenCL/m11400_a0-optimized.cl b/OpenCL/m11400_a0-optimized.cl index 58d7f4d5c..4e740b646 100644 --- a/OpenCL/m11400_a0-optimized.cl +++ b/OpenCL/m11400_a0-optimized.cl @@ -27,76 +27,79 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; + const u32 mod = offset & 3; + const u32 div = offset / 4; #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; + u32x append00 = swap32 (append0[0]); + u32x append01 = swap32 (append0[1]); + u32x append02 = swap32 (append0[2]); + u32x append03 = swap32 (append0[3]); + u32x append10 = swap32 (append1[0]); + u32x append11 = swap32 (append1[1]); + u32x append12 = swap32 (append1[2]); + u32x append13 = swap32 (append1[3]); + u32x append20 = swap32 (append2[0]); + u32x append21 = swap32 (append2[1]); + u32x append22 = swap32 (append2[2]); + u32x append23 = swap32 (append2[3]); + u32x append30 = swap32 (append3[0]); + u32x append31 = swap32 (append3[1]); + u32x append32 = swap32 (append3[2]); + u32x append33 = swap32 (append3[3]); + u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32x append4_t[4]; - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); + append0_t[0] = amd_bytealign ( 0, append00, offset); + append0_t[1] = amd_bytealign (append00, append01, offset); + append0_t[2] = amd_bytealign (append01, append02, offset); + append0_t[3] = amd_bytealign (append02, append03, offset); + append1_t[0] = amd_bytealign (append03, append10, offset); + append1_t[1] = amd_bytealign (append10, append11, offset); + append1_t[2] = amd_bytealign (append11, append12, offset); + append1_t[3] = amd_bytealign (append12, append13, offset); + append2_t[0] = amd_bytealign (append13, append20, offset); + append2_t[1] = amd_bytealign (append20, append21, offset); + append2_t[2] = amd_bytealign (append21, append22, offset); + append2_t[3] = amd_bytealign (append22, append23, offset); + append3_t[0] = amd_bytealign (append23, append30, offset); + append3_t[1] = amd_bytealign (append30, append31, offset); + append3_t[2] = amd_bytealign (append31, append32, offset); + append3_t[3] = amd_bytealign (append32, append33, offset); + append4_t[0] = amd_bytealign (append33, 0, offset); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; + append0_t[0] = swap32 (append0_t[0]); + append0_t[1] = swap32 (append0_t[1]); + append0_t[2] = swap32 (append0_t[2]); + append0_t[3] = swap32 (append0_t[3]); + append1_t[0] = swap32 (append1_t[0]); + append1_t[1] = swap32 (append1_t[1]); + append1_t[2] = swap32 (append1_t[2]); + append1_t[3] = swap32 (append1_t[3]); + append2_t[0] = swap32 (append2_t[0]); + append2_t[1] = swap32 (append2_t[1]); + append2_t[2] = swap32 (append2_t[2]); + append2_t[3] = swap32 (append2_t[3]); + append3_t[0] = swap32 (append3_t[0]); + append3_t[1] = swap32 (append3_t[1]); + append3_t[2] = swap32 (append3_t[2]); + append3_t[3] = swap32 (append3_t[3]); + append4_t[0] = swap32 (append4_t[0]); + append4_t[1] = swap32 (append4_t[1]); + append4_t[2] = swap32 (append4_t[2]); + append4_t[3] = swap32 (append4_t[3]); - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } #endif #ifdef IS_NV @@ -105,40 +108,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + u32x append00 = append0[0]; + u32x append01 = append0[1]; + u32x append02 = append0[2]; + u32x append03 = append0[3]; + u32x append10 = append1[0]; + u32x append11 = append1[1]; + u32x append12 = append1[2]; + u32x append13 = append1[3]; + u32x append20 = append2[0]; + u32x append21 = append2[1]; + u32x append22 = append2[2]; + u32x append23 = append2[3]; + u32x append30 = append3[0]; + u32x append31 = append3[1]; + u32x append32 = append3[2]; + u32x append33 = append3[3]; + u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32x append4_t[4]; - append4_t[0] = __byte_perm (append3[3], 0, selector); + append0_t[0] = __byte_perm ( 0, append00, selector); + append0_t[1] = __byte_perm (append00, append01, selector); + append0_t[2] = __byte_perm (append01, append02, selector); + append0_t[3] = __byte_perm (append02, append03, selector); + append1_t[0] = __byte_perm (append03, append10, selector); + append1_t[1] = __byte_perm (append10, append11, selector); + append1_t[2] = __byte_perm (append11, append12, selector); + append1_t[3] = __byte_perm (append12, append13, selector); + append2_t[0] = __byte_perm (append13, append20, selector); + append2_t[1] = __byte_perm (append20, append21, selector); + append2_t[2] = __byte_perm (append21, append22, selector); + append2_t[3] = __byte_perm (append22, append23, selector); + append3_t[0] = __byte_perm (append23, append30, selector); + append3_t[1] = __byte_perm (append30, append31, selector); + append3_t[2] = __byte_perm (append31, append32, selector); + append3_t[3] = __byte_perm (append32, append33, selector); + append4_t[0] = __byte_perm (append33, 0, selector); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; + #endif switch (div) @@ -147,22 +160,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 1] = append0_t[1]; block0[ 2] = append0_t[2]; block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; block0[ 5] = append1_t[1]; block0[ 6] = append1_t[2]; block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; block0[ 9] = append2_t[1]; block0[10] = append2_t[2]; block0[11] = append2_t[3]; - block0[12] = append3_t[0]; block0[13] = append3_t[1]; block0[14] = append3_t[2]; block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; block1[ 1] = append4_t[1]; block1[ 2] = append4_t[2]; @@ -173,22 +182,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 2] = append0_t[1]; block0[ 3] = append0_t[2]; block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; block0[ 6] = append1_t[1]; block0[ 7] = append1_t[2]; block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; block0[10] = append2_t[1]; block0[11] = append2_t[2]; block0[12] = append2_t[3]; - block0[13] = append3_t[0]; block0[14] = append3_t[1]; block0[15] = append3_t[2]; block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; block1[ 2] = append4_t[1]; block1[ 3] = append4_t[2]; @@ -199,22 +204,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 3] = append0_t[1]; block0[ 4] = append0_t[2]; block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; block0[ 7] = append1_t[1]; block0[ 8] = append1_t[2]; block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; block0[11] = append2_t[1]; block0[12] = append2_t[2]; block0[13] = append2_t[3]; - block0[14] = append3_t[0]; block0[15] = append3_t[1]; block1[ 0] = append3_t[2]; block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; block1[ 3] = append4_t[1]; block1[ 4] = append4_t[2]; @@ -225,22 +226,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 4] = append0_t[1]; block0[ 5] = append0_t[2]; block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; block0[ 8] = append1_t[1]; block0[ 9] = append1_t[2]; block0[10] = append1_t[3]; - block0[11] = append2_t[0]; block0[12] = append2_t[1]; block0[13] = append2_t[2]; block0[14] = append2_t[3]; - block0[15] = append3_t[0]; block1[ 0] = append3_t[1]; block1[ 1] = append3_t[2]; block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; block1[ 4] = append4_t[1]; block1[ 5] = append4_t[2]; @@ -251,22 +248,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 5] = append0_t[1]; block0[ 6] = append0_t[2]; block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; block0[ 9] = append1_t[1]; block0[10] = append1_t[2]; block0[11] = append1_t[3]; - block0[12] = append2_t[0]; block0[13] = append2_t[1]; block0[14] = append2_t[2]; block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; block1[ 1] = append3_t[1]; block1[ 2] = append3_t[2]; block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; block1[ 5] = append4_t[1]; block1[ 6] = append4_t[2]; @@ -277,22 +270,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 6] = append0_t[1]; block0[ 7] = append0_t[2]; block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; block0[10] = append1_t[1]; block0[11] = append1_t[2]; block0[12] = append1_t[3]; - block0[13] = append2_t[0]; block0[14] = append2_t[1]; block0[15] = append2_t[2]; block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; block1[ 2] = append3_t[1]; block1[ 3] = append3_t[2]; block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; block1[ 6] = append4_t[1]; block1[ 7] = append4_t[2]; @@ -303,22 +292,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 7] = append0_t[1]; block0[ 8] = append0_t[2]; block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; block0[11] = append1_t[1]; block0[12] = append1_t[2]; block0[13] = append1_t[3]; - block0[14] = append2_t[0]; block0[15] = append2_t[1]; block1[ 0] = append2_t[2]; block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; block1[ 3] = append3_t[1]; block1[ 4] = append3_t[2]; block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; block1[ 7] = append4_t[1]; block1[ 8] = append4_t[2]; @@ -329,22 +314,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 8] = append0_t[1]; block0[ 9] = append0_t[2]; block0[10] = append0_t[3]; - block0[11] = append1_t[0]; block0[12] = append1_t[1]; block0[13] = append1_t[2]; block0[14] = append1_t[3]; - block0[15] = append2_t[0]; block1[ 0] = append2_t[1]; block1[ 1] = append2_t[2]; block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; block1[ 4] = append3_t[1]; block1[ 5] = append3_t[2]; block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; block1[ 8] = append4_t[1]; block1[ 9] = append4_t[2]; @@ -355,22 +336,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 9] = append0_t[1]; block0[10] = append0_t[2]; block0[11] = append0_t[3]; - block0[12] = append1_t[0]; block0[13] = append1_t[1]; block0[14] = append1_t[2]; block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; block1[ 1] = append2_t[1]; block1[ 2] = append2_t[2]; block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; block1[ 5] = append3_t[1]; block1[ 6] = append3_t[2]; block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; block1[ 9] = append4_t[1]; block1[10] = append4_t[2]; @@ -381,22 +358,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[10] = append0_t[1]; block0[11] = append0_t[2]; block0[12] = append0_t[3]; - block0[13] = append1_t[0]; block0[14] = append1_t[1]; block0[15] = append1_t[2]; block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; block1[ 2] = append2_t[1]; block1[ 3] = append2_t[2]; block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; block1[ 6] = append3_t[1]; block1[ 7] = append3_t[2]; block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; block1[10] = append4_t[1]; block1[11] = append4_t[2]; @@ -407,22 +380,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[11] = append0_t[1]; block0[12] = append0_t[2]; block0[13] = append0_t[3]; - block0[14] = append1_t[0]; block0[15] = append1_t[1]; block1[ 0] = append1_t[2]; block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; block1[ 3] = append2_t[1]; block1[ 4] = append2_t[2]; block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; block1[ 7] = append3_t[1]; block1[ 8] = append3_t[2]; block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; block1[11] = append4_t[1]; block1[12] = append4_t[2]; @@ -433,22 +402,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[12] = append0_t[1]; block0[13] = append0_t[2]; block0[14] = append0_t[3]; - block0[15] = append1_t[0]; block1[ 0] = append1_t[1]; block1[ 1] = append1_t[2]; block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; block1[ 4] = append2_t[1]; block1[ 5] = append2_t[2]; block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; block1[ 8] = append3_t[1]; block1[ 9] = append3_t[2]; block1[10] = append3_t[3]; - block1[11] = append4_t[0]; block1[12] = append4_t[1]; block1[13] = append4_t[2]; @@ -459,22 +424,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[13] = append0_t[1]; block0[14] = append0_t[2]; block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; block1[ 1] = append1_t[1]; block1[ 2] = append1_t[2]; block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; block1[ 5] = append2_t[1]; block1[ 6] = append2_t[2]; block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; block1[ 9] = append3_t[1]; block1[10] = append3_t[2]; block1[11] = append3_t[3]; - block1[12] = append4_t[0]; block1[13] = append4_t[1]; block1[14] = append4_t[2]; @@ -485,22 +446,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[14] = append0_t[1]; block0[15] = append0_t[2]; block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; block1[ 2] = append1_t[1]; block1[ 3] = append1_t[2]; block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; block1[ 6] = append2_t[1]; block1[ 7] = append2_t[2]; block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; block1[10] = append3_t[1]; block1[11] = append3_t[2]; block1[12] = append3_t[3]; - block1[13] = append4_t[0]; block1[14] = append4_t[1]; block1[15] = append4_t[2]; @@ -510,22 +467,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[15] = append0_t[1]; block1[ 0] = append0_t[2]; block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; block1[ 7] = append2_t[1]; block1[ 8] = append2_t[2]; block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; block1[11] = append3_t[1]; block1[12] = append3_t[2]; block1[13] = append3_t[3]; - block1[14] = append4_t[0]; block1[15] = append4_t[1]; break; @@ -534,22 +487,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 0] = append0_t[1]; block1[ 1] = append0_t[2]; block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; block1[ 8] = append2_t[1]; block1[ 9] = append2_t[2]; block1[10] = append2_t[3]; - block1[11] = append3_t[0]; block1[12] = append3_t[1]; block1[13] = append3_t[2]; block1[14] = append3_t[3]; - block1[15] = append4_t[0]; break; @@ -557,17 +506,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 1] = append0_t[1]; block1[ 2] = append0_t[2]; block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; block1[ 5] = append1_t[1]; block1[ 6] = append1_t[2]; block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; block1[ 9] = append2_t[1]; block1[10] = append2_t[2]; block1[11] = append2_t[3]; - block1[12] = append3_t[0]; block1[13] = append3_t[1]; block1[14] = append3_t[2]; @@ -578,17 +524,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 2] = append0_t[1]; block1[ 3] = append0_t[2]; block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; block1[ 6] = append1_t[1]; block1[ 7] = append1_t[2]; block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; block1[10] = append2_t[1]; block1[11] = append2_t[2]; block1[12] = append2_t[3]; - block1[13] = append3_t[0]; block1[14] = append3_t[1]; block1[15] = append3_t[2]; @@ -598,17 +541,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 3] = append0_t[1]; block1[ 4] = append0_t[2]; block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; block1[ 7] = append1_t[1]; block1[ 8] = append1_t[2]; block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; block1[11] = append2_t[1]; block1[12] = append2_t[2]; block1[13] = append2_t[3]; - block1[14] = append3_t[0]; block1[15] = append3_t[1]; break; @@ -617,17 +557,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 4] = append0_t[1]; block1[ 5] = append0_t[2]; block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; block1[ 8] = append1_t[1]; block1[ 9] = append1_t[2]; block1[10] = append1_t[3]; - block1[11] = append2_t[0]; block1[12] = append2_t[1]; block1[13] = append2_t[2]; block1[14] = append2_t[3]; - block1[15] = append3_t[0]; break; @@ -635,12 +572,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 5] = append0_t[1]; block1[ 6] = append0_t[2]; block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; block1[ 9] = append1_t[1]; block1[10] = append1_t[2]; block1[11] = append1_t[3]; - block1[12] = append2_t[0]; block1[13] = append2_t[1]; block1[14] = append2_t[2]; @@ -651,12 +586,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 6] = append0_t[1]; block1[ 7] = append0_t[2]; block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; block1[10] = append1_t[1]; block1[11] = append1_t[2]; block1[12] = append1_t[3]; - block1[13] = append2_t[0]; block1[14] = append2_t[1]; block1[15] = append2_t[2]; @@ -666,12 +599,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 7] = append0_t[1]; block1[ 8] = append0_t[2]; block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; block1[11] = append1_t[1]; block1[12] = append1_t[2]; block1[13] = append1_t[3]; - block1[14] = append2_t[0]; block1[15] = append2_t[1]; break; @@ -680,12 +611,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 8] = append0_t[1]; block1[ 9] = append0_t[2]; block1[10] = append0_t[3]; - block1[11] = append1_t[0]; block1[12] = append1_t[1]; block1[13] = append1_t[2]; block1[14] = append1_t[3]; - block1[15] = append2_t[0]; break; @@ -693,7 +622,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 9] = append0_t[1]; block1[10] = append0_t[2]; block1[11] = append0_t[3]; - block1[12] = append1_t[0]; block1[13] = append1_t[1]; block1[14] = append1_t[2]; @@ -704,7 +632,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[10] = append0_t[1]; block1[11] = append0_t[2]; block1[12] = append0_t[3]; - block1[13] = append1_t[0]; block1[14] = append1_t[1]; block1[15] = append1_t[2]; @@ -714,7 +641,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[11] = append0_t[1]; block1[12] = append0_t[2]; block1[13] = append0_t[3]; - block1[14] = append1_t[0]; block1[15] = append1_t[1]; break; @@ -723,7 +649,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[12] = append0_t[1]; block1[13] = append0_t[2]; block1[14] = append0_t[3]; - block1[15] = append1_t[0]; break; @@ -743,7 +668,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } diff --git a/OpenCL/m11400_a1-optimized.cl b/OpenCL/m11400_a1-optimized.cl index 4fc47020e..4df896258 100644 --- a/OpenCL/m11400_a1-optimized.cl +++ b/OpenCL/m11400_a1-optimized.cl @@ -25,76 +25,79 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; + const u32 mod = offset & 3; + const u32 div = offset / 4; #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; + u32x append00 = swap32 (append0[0]); + u32x append01 = swap32 (append0[1]); + u32x append02 = swap32 (append0[2]); + u32x append03 = swap32 (append0[3]); + u32x append10 = swap32 (append1[0]); + u32x append11 = swap32 (append1[1]); + u32x append12 = swap32 (append1[2]); + u32x append13 = swap32 (append1[3]); + u32x append20 = swap32 (append2[0]); + u32x append21 = swap32 (append2[1]); + u32x append22 = swap32 (append2[2]); + u32x append23 = swap32 (append2[3]); + u32x append30 = swap32 (append3[0]); + u32x append31 = swap32 (append3[1]); + u32x append32 = swap32 (append3[2]); + u32x append33 = swap32 (append3[3]); + u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32x append4_t[4]; - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); + append0_t[0] = amd_bytealign ( 0, append00, offset); + append0_t[1] = amd_bytealign (append00, append01, offset); + append0_t[2] = amd_bytealign (append01, append02, offset); + append0_t[3] = amd_bytealign (append02, append03, offset); + append1_t[0] = amd_bytealign (append03, append10, offset); + append1_t[1] = amd_bytealign (append10, append11, offset); + append1_t[2] = amd_bytealign (append11, append12, offset); + append1_t[3] = amd_bytealign (append12, append13, offset); + append2_t[0] = amd_bytealign (append13, append20, offset); + append2_t[1] = amd_bytealign (append20, append21, offset); + append2_t[2] = amd_bytealign (append21, append22, offset); + append2_t[3] = amd_bytealign (append22, append23, offset); + append3_t[0] = amd_bytealign (append23, append30, offset); + append3_t[1] = amd_bytealign (append30, append31, offset); + append3_t[2] = amd_bytealign (append31, append32, offset); + append3_t[3] = amd_bytealign (append32, append33, offset); + append4_t[0] = amd_bytealign (append33, 0, offset); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; + append0_t[0] = swap32 (append0_t[0]); + append0_t[1] = swap32 (append0_t[1]); + append0_t[2] = swap32 (append0_t[2]); + append0_t[3] = swap32 (append0_t[3]); + append1_t[0] = swap32 (append1_t[0]); + append1_t[1] = swap32 (append1_t[1]); + append1_t[2] = swap32 (append1_t[2]); + append1_t[3] = swap32 (append1_t[3]); + append2_t[0] = swap32 (append2_t[0]); + append2_t[1] = swap32 (append2_t[1]); + append2_t[2] = swap32 (append2_t[2]); + append2_t[3] = swap32 (append2_t[3]); + append3_t[0] = swap32 (append3_t[0]); + append3_t[1] = swap32 (append3_t[1]); + append3_t[2] = swap32 (append3_t[2]); + append3_t[3] = swap32 (append3_t[3]); + append4_t[0] = swap32 (append4_t[0]); + append4_t[1] = swap32 (append4_t[1]); + append4_t[2] = swap32 (append4_t[2]); + append4_t[3] = swap32 (append4_t[3]); - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } #endif #ifdef IS_NV @@ -103,40 +106,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + u32x append00 = append0[0]; + u32x append01 = append0[1]; + u32x append02 = append0[2]; + u32x append03 = append0[3]; + u32x append10 = append1[0]; + u32x append11 = append1[1]; + u32x append12 = append1[2]; + u32x append13 = append1[3]; + u32x append20 = append2[0]; + u32x append21 = append2[1]; + u32x append22 = append2[2]; + u32x append23 = append2[3]; + u32x append30 = append3[0]; + u32x append31 = append3[1]; + u32x append32 = append3[2]; + u32x append33 = append3[3]; + u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32x append4_t[4]; - append4_t[0] = __byte_perm (append3[3], 0, selector); + append0_t[0] = __byte_perm ( 0, append00, selector); + append0_t[1] = __byte_perm (append00, append01, selector); + append0_t[2] = __byte_perm (append01, append02, selector); + append0_t[3] = __byte_perm (append02, append03, selector); + append1_t[0] = __byte_perm (append03, append10, selector); + append1_t[1] = __byte_perm (append10, append11, selector); + append1_t[2] = __byte_perm (append11, append12, selector); + append1_t[3] = __byte_perm (append12, append13, selector); + append2_t[0] = __byte_perm (append13, append20, selector); + append2_t[1] = __byte_perm (append20, append21, selector); + append2_t[2] = __byte_perm (append21, append22, selector); + append2_t[3] = __byte_perm (append22, append23, selector); + append3_t[0] = __byte_perm (append23, append30, selector); + append3_t[1] = __byte_perm (append30, append31, selector); + append3_t[2] = __byte_perm (append31, append32, selector); + append3_t[3] = __byte_perm (append32, append33, selector); + append4_t[0] = __byte_perm (append33, 0, selector); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; + #endif switch (div) @@ -145,22 +158,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 1] = append0_t[1]; block0[ 2] = append0_t[2]; block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; block0[ 5] = append1_t[1]; block0[ 6] = append1_t[2]; block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; block0[ 9] = append2_t[1]; block0[10] = append2_t[2]; block0[11] = append2_t[3]; - block0[12] = append3_t[0]; block0[13] = append3_t[1]; block0[14] = append3_t[2]; block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; block1[ 1] = append4_t[1]; block1[ 2] = append4_t[2]; @@ -171,22 +180,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 2] = append0_t[1]; block0[ 3] = append0_t[2]; block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; block0[ 6] = append1_t[1]; block0[ 7] = append1_t[2]; block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; block0[10] = append2_t[1]; block0[11] = append2_t[2]; block0[12] = append2_t[3]; - block0[13] = append3_t[0]; block0[14] = append3_t[1]; block0[15] = append3_t[2]; block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; block1[ 2] = append4_t[1]; block1[ 3] = append4_t[2]; @@ -197,22 +202,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 3] = append0_t[1]; block0[ 4] = append0_t[2]; block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; block0[ 7] = append1_t[1]; block0[ 8] = append1_t[2]; block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; block0[11] = append2_t[1]; block0[12] = append2_t[2]; block0[13] = append2_t[3]; - block0[14] = append3_t[0]; block0[15] = append3_t[1]; block1[ 0] = append3_t[2]; block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; block1[ 3] = append4_t[1]; block1[ 4] = append4_t[2]; @@ -223,22 +224,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 4] = append0_t[1]; block0[ 5] = append0_t[2]; block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; block0[ 8] = append1_t[1]; block0[ 9] = append1_t[2]; block0[10] = append1_t[3]; - block0[11] = append2_t[0]; block0[12] = append2_t[1]; block0[13] = append2_t[2]; block0[14] = append2_t[3]; - block0[15] = append3_t[0]; block1[ 0] = append3_t[1]; block1[ 1] = append3_t[2]; block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; block1[ 4] = append4_t[1]; block1[ 5] = append4_t[2]; @@ -249,22 +246,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 5] = append0_t[1]; block0[ 6] = append0_t[2]; block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; block0[ 9] = append1_t[1]; block0[10] = append1_t[2]; block0[11] = append1_t[3]; - block0[12] = append2_t[0]; block0[13] = append2_t[1]; block0[14] = append2_t[2]; block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; block1[ 1] = append3_t[1]; block1[ 2] = append3_t[2]; block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; block1[ 5] = append4_t[1]; block1[ 6] = append4_t[2]; @@ -275,22 +268,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 6] = append0_t[1]; block0[ 7] = append0_t[2]; block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; block0[10] = append1_t[1]; block0[11] = append1_t[2]; block0[12] = append1_t[3]; - block0[13] = append2_t[0]; block0[14] = append2_t[1]; block0[15] = append2_t[2]; block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; block1[ 2] = append3_t[1]; block1[ 3] = append3_t[2]; block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; block1[ 6] = append4_t[1]; block1[ 7] = append4_t[2]; @@ -301,22 +290,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 7] = append0_t[1]; block0[ 8] = append0_t[2]; block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; block0[11] = append1_t[1]; block0[12] = append1_t[2]; block0[13] = append1_t[3]; - block0[14] = append2_t[0]; block0[15] = append2_t[1]; block1[ 0] = append2_t[2]; block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; block1[ 3] = append3_t[1]; block1[ 4] = append3_t[2]; block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; block1[ 7] = append4_t[1]; block1[ 8] = append4_t[2]; @@ -327,22 +312,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 8] = append0_t[1]; block0[ 9] = append0_t[2]; block0[10] = append0_t[3]; - block0[11] = append1_t[0]; block0[12] = append1_t[1]; block0[13] = append1_t[2]; block0[14] = append1_t[3]; - block0[15] = append2_t[0]; block1[ 0] = append2_t[1]; block1[ 1] = append2_t[2]; block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; block1[ 4] = append3_t[1]; block1[ 5] = append3_t[2]; block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; block1[ 8] = append4_t[1]; block1[ 9] = append4_t[2]; @@ -353,22 +334,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 9] = append0_t[1]; block0[10] = append0_t[2]; block0[11] = append0_t[3]; - block0[12] = append1_t[0]; block0[13] = append1_t[1]; block0[14] = append1_t[2]; block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; block1[ 1] = append2_t[1]; block1[ 2] = append2_t[2]; block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; block1[ 5] = append3_t[1]; block1[ 6] = append3_t[2]; block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; block1[ 9] = append4_t[1]; block1[10] = append4_t[2]; @@ -379,22 +356,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[10] = append0_t[1]; block0[11] = append0_t[2]; block0[12] = append0_t[3]; - block0[13] = append1_t[0]; block0[14] = append1_t[1]; block0[15] = append1_t[2]; block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; block1[ 2] = append2_t[1]; block1[ 3] = append2_t[2]; block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; block1[ 6] = append3_t[1]; block1[ 7] = append3_t[2]; block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; block1[10] = append4_t[1]; block1[11] = append4_t[2]; @@ -405,22 +378,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[11] = append0_t[1]; block0[12] = append0_t[2]; block0[13] = append0_t[3]; - block0[14] = append1_t[0]; block0[15] = append1_t[1]; block1[ 0] = append1_t[2]; block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; block1[ 3] = append2_t[1]; block1[ 4] = append2_t[2]; block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; block1[ 7] = append3_t[1]; block1[ 8] = append3_t[2]; block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; block1[11] = append4_t[1]; block1[12] = append4_t[2]; @@ -431,22 +400,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[12] = append0_t[1]; block0[13] = append0_t[2]; block0[14] = append0_t[3]; - block0[15] = append1_t[0]; block1[ 0] = append1_t[1]; block1[ 1] = append1_t[2]; block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; block1[ 4] = append2_t[1]; block1[ 5] = append2_t[2]; block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; block1[ 8] = append3_t[1]; block1[ 9] = append3_t[2]; block1[10] = append3_t[3]; - block1[11] = append4_t[0]; block1[12] = append4_t[1]; block1[13] = append4_t[2]; @@ -457,22 +422,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[13] = append0_t[1]; block0[14] = append0_t[2]; block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; block1[ 1] = append1_t[1]; block1[ 2] = append1_t[2]; block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; block1[ 5] = append2_t[1]; block1[ 6] = append2_t[2]; block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; block1[ 9] = append3_t[1]; block1[10] = append3_t[2]; block1[11] = append3_t[3]; - block1[12] = append4_t[0]; block1[13] = append4_t[1]; block1[14] = append4_t[2]; @@ -483,22 +444,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[14] = append0_t[1]; block0[15] = append0_t[2]; block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; block1[ 2] = append1_t[1]; block1[ 3] = append1_t[2]; block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; block1[ 6] = append2_t[1]; block1[ 7] = append2_t[2]; block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; block1[10] = append3_t[1]; block1[11] = append3_t[2]; block1[12] = append3_t[3]; - block1[13] = append4_t[0]; block1[14] = append4_t[1]; block1[15] = append4_t[2]; @@ -508,22 +465,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[15] = append0_t[1]; block1[ 0] = append0_t[2]; block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; block1[ 7] = append2_t[1]; block1[ 8] = append2_t[2]; block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; block1[11] = append3_t[1]; block1[12] = append3_t[2]; block1[13] = append3_t[3]; - block1[14] = append4_t[0]; block1[15] = append4_t[1]; break; @@ -532,22 +485,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 0] = append0_t[1]; block1[ 1] = append0_t[2]; block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; block1[ 8] = append2_t[1]; block1[ 9] = append2_t[2]; block1[10] = append2_t[3]; - block1[11] = append3_t[0]; block1[12] = append3_t[1]; block1[13] = append3_t[2]; block1[14] = append3_t[3]; - block1[15] = append4_t[0]; break; @@ -555,17 +504,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 1] = append0_t[1]; block1[ 2] = append0_t[2]; block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; block1[ 5] = append1_t[1]; block1[ 6] = append1_t[2]; block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; block1[ 9] = append2_t[1]; block1[10] = append2_t[2]; block1[11] = append2_t[3]; - block1[12] = append3_t[0]; block1[13] = append3_t[1]; block1[14] = append3_t[2]; @@ -576,17 +522,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 2] = append0_t[1]; block1[ 3] = append0_t[2]; block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; block1[ 6] = append1_t[1]; block1[ 7] = append1_t[2]; block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; block1[10] = append2_t[1]; block1[11] = append2_t[2]; block1[12] = append2_t[3]; - block1[13] = append3_t[0]; block1[14] = append3_t[1]; block1[15] = append3_t[2]; @@ -596,17 +539,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 3] = append0_t[1]; block1[ 4] = append0_t[2]; block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; block1[ 7] = append1_t[1]; block1[ 8] = append1_t[2]; block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; block1[11] = append2_t[1]; block1[12] = append2_t[2]; block1[13] = append2_t[3]; - block1[14] = append3_t[0]; block1[15] = append3_t[1]; break; @@ -615,17 +555,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 4] = append0_t[1]; block1[ 5] = append0_t[2]; block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; block1[ 8] = append1_t[1]; block1[ 9] = append1_t[2]; block1[10] = append1_t[3]; - block1[11] = append2_t[0]; block1[12] = append2_t[1]; block1[13] = append2_t[2]; block1[14] = append2_t[3]; - block1[15] = append3_t[0]; break; @@ -633,12 +570,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 5] = append0_t[1]; block1[ 6] = append0_t[2]; block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; block1[ 9] = append1_t[1]; block1[10] = append1_t[2]; block1[11] = append1_t[3]; - block1[12] = append2_t[0]; block1[13] = append2_t[1]; block1[14] = append2_t[2]; @@ -649,12 +584,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 6] = append0_t[1]; block1[ 7] = append0_t[2]; block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; block1[10] = append1_t[1]; block1[11] = append1_t[2]; block1[12] = append1_t[3]; - block1[13] = append2_t[0]; block1[14] = append2_t[1]; block1[15] = append2_t[2]; @@ -664,12 +597,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 7] = append0_t[1]; block1[ 8] = append0_t[2]; block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; block1[11] = append1_t[1]; block1[12] = append1_t[2]; block1[13] = append1_t[3]; - block1[14] = append2_t[0]; block1[15] = append2_t[1]; break; @@ -678,12 +609,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 8] = append0_t[1]; block1[ 9] = append0_t[2]; block1[10] = append0_t[3]; - block1[11] = append1_t[0]; block1[12] = append1_t[1]; block1[13] = append1_t[2]; block1[14] = append1_t[3]; - block1[15] = append2_t[0]; break; @@ -691,7 +620,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 9] = append0_t[1]; block1[10] = append0_t[2]; block1[11] = append0_t[3]; - block1[12] = append1_t[0]; block1[13] = append1_t[1]; block1[14] = append1_t[2]; @@ -702,7 +630,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[10] = append0_t[1]; block1[11] = append0_t[2]; block1[12] = append0_t[3]; - block1[13] = append1_t[0]; block1[14] = append1_t[1]; block1[15] = append1_t[2]; @@ -712,7 +639,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[11] = append0_t[1]; block1[12] = append0_t[2]; block1[13] = append0_t[3]; - block1[14] = append1_t[0]; block1[15] = append1_t[1]; break; @@ -721,7 +647,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[12] = append0_t[1]; block1[13] = append0_t[2]; block1[14] = append0_t[3]; - block1[15] = append1_t[0]; break; @@ -741,7 +666,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } diff --git a/OpenCL/m11400_a3-optimized.cl b/OpenCL/m11400_a3-optimized.cl index efcffb31e..4e883375d 100644 --- a/OpenCL/m11400_a3-optimized.cl +++ b/OpenCL/m11400_a3-optimized.cl @@ -25,76 +25,79 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; + const u32 mod = offset & 3; + const u32 div = offset / 4; #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; + u32x append00 = swap32 (append0[0]); + u32x append01 = swap32 (append0[1]); + u32x append02 = swap32 (append0[2]); + u32x append03 = swap32 (append0[3]); + u32x append10 = swap32 (append1[0]); + u32x append11 = swap32 (append1[1]); + u32x append12 = swap32 (append1[2]); + u32x append13 = swap32 (append1[3]); + u32x append20 = swap32 (append2[0]); + u32x append21 = swap32 (append2[1]); + u32x append22 = swap32 (append2[2]); + u32x append23 = swap32 (append2[3]); + u32x append30 = swap32 (append3[0]); + u32x append31 = swap32 (append3[1]); + u32x append32 = swap32 (append3[2]); + u32x append33 = swap32 (append3[3]); + u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32x append4_t[4]; - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); + append0_t[0] = amd_bytealign ( 0, append00, offset); + append0_t[1] = amd_bytealign (append00, append01, offset); + append0_t[2] = amd_bytealign (append01, append02, offset); + append0_t[3] = amd_bytealign (append02, append03, offset); + append1_t[0] = amd_bytealign (append03, append10, offset); + append1_t[1] = amd_bytealign (append10, append11, offset); + append1_t[2] = amd_bytealign (append11, append12, offset); + append1_t[3] = amd_bytealign (append12, append13, offset); + append2_t[0] = amd_bytealign (append13, append20, offset); + append2_t[1] = amd_bytealign (append20, append21, offset); + append2_t[2] = amd_bytealign (append21, append22, offset); + append2_t[3] = amd_bytealign (append22, append23, offset); + append3_t[0] = amd_bytealign (append23, append30, offset); + append3_t[1] = amd_bytealign (append30, append31, offset); + append3_t[2] = amd_bytealign (append31, append32, offset); + append3_t[3] = amd_bytealign (append32, append33, offset); + append4_t[0] = amd_bytealign (append33, 0, offset); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; + append0_t[0] = swap32 (append0_t[0]); + append0_t[1] = swap32 (append0_t[1]); + append0_t[2] = swap32 (append0_t[2]); + append0_t[3] = swap32 (append0_t[3]); + append1_t[0] = swap32 (append1_t[0]); + append1_t[1] = swap32 (append1_t[1]); + append1_t[2] = swap32 (append1_t[2]); + append1_t[3] = swap32 (append1_t[3]); + append2_t[0] = swap32 (append2_t[0]); + append2_t[1] = swap32 (append2_t[1]); + append2_t[2] = swap32 (append2_t[2]); + append2_t[3] = swap32 (append2_t[3]); + append3_t[0] = swap32 (append3_t[0]); + append3_t[1] = swap32 (append3_t[1]); + append3_t[2] = swap32 (append3_t[2]); + append3_t[3] = swap32 (append3_t[3]); + append4_t[0] = swap32 (append4_t[0]); + append4_t[1] = swap32 (append4_t[1]); + append4_t[2] = swap32 (append4_t[2]); + append4_t[3] = swap32 (append4_t[3]); - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } #endif #ifdef IS_NV @@ -103,40 +106,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + u32x append00 = append0[0]; + u32x append01 = append0[1]; + u32x append02 = append0[2]; + u32x append03 = append0[3]; + u32x append10 = append1[0]; + u32x append11 = append1[1]; + u32x append12 = append1[2]; + u32x append13 = append1[3]; + u32x append20 = append2[0]; + u32x append21 = append2[1]; + u32x append22 = append2[2]; + u32x append23 = append2[3]; + u32x append30 = append3[0]; + u32x append31 = append3[1]; + u32x append32 = append3[2]; + u32x append33 = append3[3]; + u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32x append4_t[4]; - append4_t[0] = __byte_perm (append3[3], 0, selector); + append0_t[0] = __byte_perm ( 0, append00, selector); + append0_t[1] = __byte_perm (append00, append01, selector); + append0_t[2] = __byte_perm (append01, append02, selector); + append0_t[3] = __byte_perm (append02, append03, selector); + append1_t[0] = __byte_perm (append03, append10, selector); + append1_t[1] = __byte_perm (append10, append11, selector); + append1_t[2] = __byte_perm (append11, append12, selector); + append1_t[3] = __byte_perm (append12, append13, selector); + append2_t[0] = __byte_perm (append13, append20, selector); + append2_t[1] = __byte_perm (append20, append21, selector); + append2_t[2] = __byte_perm (append21, append22, selector); + append2_t[3] = __byte_perm (append22, append23, selector); + append3_t[0] = __byte_perm (append23, append30, selector); + append3_t[1] = __byte_perm (append30, append31, selector); + append3_t[2] = __byte_perm (append31, append32, selector); + append3_t[3] = __byte_perm (append32, append33, selector); + append4_t[0] = __byte_perm (append33, 0, selector); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; + #endif switch (div) @@ -145,22 +158,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 1] = append0_t[1]; block0[ 2] = append0_t[2]; block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; block0[ 5] = append1_t[1]; block0[ 6] = append1_t[2]; block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; block0[ 9] = append2_t[1]; block0[10] = append2_t[2]; block0[11] = append2_t[3]; - block0[12] = append3_t[0]; block0[13] = append3_t[1]; block0[14] = append3_t[2]; block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; block1[ 1] = append4_t[1]; block1[ 2] = append4_t[2]; @@ -171,22 +180,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 2] = append0_t[1]; block0[ 3] = append0_t[2]; block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; block0[ 6] = append1_t[1]; block0[ 7] = append1_t[2]; block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; block0[10] = append2_t[1]; block0[11] = append2_t[2]; block0[12] = append2_t[3]; - block0[13] = append3_t[0]; block0[14] = append3_t[1]; block0[15] = append3_t[2]; block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; block1[ 2] = append4_t[1]; block1[ 3] = append4_t[2]; @@ -197,22 +202,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 3] = append0_t[1]; block0[ 4] = append0_t[2]; block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; block0[ 7] = append1_t[1]; block0[ 8] = append1_t[2]; block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; block0[11] = append2_t[1]; block0[12] = append2_t[2]; block0[13] = append2_t[3]; - block0[14] = append3_t[0]; block0[15] = append3_t[1]; block1[ 0] = append3_t[2]; block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; block1[ 3] = append4_t[1]; block1[ 4] = append4_t[2]; @@ -223,22 +224,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 4] = append0_t[1]; block0[ 5] = append0_t[2]; block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; block0[ 8] = append1_t[1]; block0[ 9] = append1_t[2]; block0[10] = append1_t[3]; - block0[11] = append2_t[0]; block0[12] = append2_t[1]; block0[13] = append2_t[2]; block0[14] = append2_t[3]; - block0[15] = append3_t[0]; block1[ 0] = append3_t[1]; block1[ 1] = append3_t[2]; block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; block1[ 4] = append4_t[1]; block1[ 5] = append4_t[2]; @@ -249,22 +246,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 5] = append0_t[1]; block0[ 6] = append0_t[2]; block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; block0[ 9] = append1_t[1]; block0[10] = append1_t[2]; block0[11] = append1_t[3]; - block0[12] = append2_t[0]; block0[13] = append2_t[1]; block0[14] = append2_t[2]; block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; block1[ 1] = append3_t[1]; block1[ 2] = append3_t[2]; block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; block1[ 5] = append4_t[1]; block1[ 6] = append4_t[2]; @@ -275,22 +268,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 6] = append0_t[1]; block0[ 7] = append0_t[2]; block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; block0[10] = append1_t[1]; block0[11] = append1_t[2]; block0[12] = append1_t[3]; - block0[13] = append2_t[0]; block0[14] = append2_t[1]; block0[15] = append2_t[2]; block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; block1[ 2] = append3_t[1]; block1[ 3] = append3_t[2]; block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; block1[ 6] = append4_t[1]; block1[ 7] = append4_t[2]; @@ -301,22 +290,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 7] = append0_t[1]; block0[ 8] = append0_t[2]; block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; block0[11] = append1_t[1]; block0[12] = append1_t[2]; block0[13] = append1_t[3]; - block0[14] = append2_t[0]; block0[15] = append2_t[1]; block1[ 0] = append2_t[2]; block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; block1[ 3] = append3_t[1]; block1[ 4] = append3_t[2]; block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; block1[ 7] = append4_t[1]; block1[ 8] = append4_t[2]; @@ -327,22 +312,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 8] = append0_t[1]; block0[ 9] = append0_t[2]; block0[10] = append0_t[3]; - block0[11] = append1_t[0]; block0[12] = append1_t[1]; block0[13] = append1_t[2]; block0[14] = append1_t[3]; - block0[15] = append2_t[0]; block1[ 0] = append2_t[1]; block1[ 1] = append2_t[2]; block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; block1[ 4] = append3_t[1]; block1[ 5] = append3_t[2]; block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; block1[ 8] = append4_t[1]; block1[ 9] = append4_t[2]; @@ -353,22 +334,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 9] = append0_t[1]; block0[10] = append0_t[2]; block0[11] = append0_t[3]; - block0[12] = append1_t[0]; block0[13] = append1_t[1]; block0[14] = append1_t[2]; block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; block1[ 1] = append2_t[1]; block1[ 2] = append2_t[2]; block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; block1[ 5] = append3_t[1]; block1[ 6] = append3_t[2]; block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; block1[ 9] = append4_t[1]; block1[10] = append4_t[2]; @@ -379,22 +356,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[10] = append0_t[1]; block0[11] = append0_t[2]; block0[12] = append0_t[3]; - block0[13] = append1_t[0]; block0[14] = append1_t[1]; block0[15] = append1_t[2]; block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; block1[ 2] = append2_t[1]; block1[ 3] = append2_t[2]; block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; block1[ 6] = append3_t[1]; block1[ 7] = append3_t[2]; block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; block1[10] = append4_t[1]; block1[11] = append4_t[2]; @@ -405,22 +378,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[11] = append0_t[1]; block0[12] = append0_t[2]; block0[13] = append0_t[3]; - block0[14] = append1_t[0]; block0[15] = append1_t[1]; block1[ 0] = append1_t[2]; block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; block1[ 3] = append2_t[1]; block1[ 4] = append2_t[2]; block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; block1[ 7] = append3_t[1]; block1[ 8] = append3_t[2]; block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; block1[11] = append4_t[1]; block1[12] = append4_t[2]; @@ -431,22 +400,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[12] = append0_t[1]; block0[13] = append0_t[2]; block0[14] = append0_t[3]; - block0[15] = append1_t[0]; block1[ 0] = append1_t[1]; block1[ 1] = append1_t[2]; block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; block1[ 4] = append2_t[1]; block1[ 5] = append2_t[2]; block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; block1[ 8] = append3_t[1]; block1[ 9] = append3_t[2]; block1[10] = append3_t[3]; - block1[11] = append4_t[0]; block1[12] = append4_t[1]; block1[13] = append4_t[2]; @@ -457,22 +422,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[13] = append0_t[1]; block0[14] = append0_t[2]; block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; block1[ 1] = append1_t[1]; block1[ 2] = append1_t[2]; block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; block1[ 5] = append2_t[1]; block1[ 6] = append2_t[2]; block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; block1[ 9] = append3_t[1]; block1[10] = append3_t[2]; block1[11] = append3_t[3]; - block1[12] = append4_t[0]; block1[13] = append4_t[1]; block1[14] = append4_t[2]; @@ -483,22 +444,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[14] = append0_t[1]; block0[15] = append0_t[2]; block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; block1[ 2] = append1_t[1]; block1[ 3] = append1_t[2]; block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; block1[ 6] = append2_t[1]; block1[ 7] = append2_t[2]; block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; block1[10] = append3_t[1]; block1[11] = append3_t[2]; block1[12] = append3_t[3]; - block1[13] = append4_t[0]; block1[14] = append4_t[1]; block1[15] = append4_t[2]; @@ -508,22 +465,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[15] = append0_t[1]; block1[ 0] = append0_t[2]; block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; block1[ 7] = append2_t[1]; block1[ 8] = append2_t[2]; block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; block1[11] = append3_t[1]; block1[12] = append3_t[2]; block1[13] = append3_t[3]; - block1[14] = append4_t[0]; block1[15] = append4_t[1]; break; @@ -532,22 +485,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 0] = append0_t[1]; block1[ 1] = append0_t[2]; block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; block1[ 8] = append2_t[1]; block1[ 9] = append2_t[2]; block1[10] = append2_t[3]; - block1[11] = append3_t[0]; block1[12] = append3_t[1]; block1[13] = append3_t[2]; block1[14] = append3_t[3]; - block1[15] = append4_t[0]; break; @@ -555,17 +504,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 1] = append0_t[1]; block1[ 2] = append0_t[2]; block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; block1[ 5] = append1_t[1]; block1[ 6] = append1_t[2]; block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; block1[ 9] = append2_t[1]; block1[10] = append2_t[2]; block1[11] = append2_t[3]; - block1[12] = append3_t[0]; block1[13] = append3_t[1]; block1[14] = append3_t[2]; @@ -576,17 +522,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 2] = append0_t[1]; block1[ 3] = append0_t[2]; block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; block1[ 6] = append1_t[1]; block1[ 7] = append1_t[2]; block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; block1[10] = append2_t[1]; block1[11] = append2_t[2]; block1[12] = append2_t[3]; - block1[13] = append3_t[0]; block1[14] = append3_t[1]; block1[15] = append3_t[2]; @@ -596,17 +539,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 3] = append0_t[1]; block1[ 4] = append0_t[2]; block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; block1[ 7] = append1_t[1]; block1[ 8] = append1_t[2]; block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; block1[11] = append2_t[1]; block1[12] = append2_t[2]; block1[13] = append2_t[3]; - block1[14] = append3_t[0]; block1[15] = append3_t[1]; break; @@ -615,17 +555,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 4] = append0_t[1]; block1[ 5] = append0_t[2]; block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; block1[ 8] = append1_t[1]; block1[ 9] = append1_t[2]; block1[10] = append1_t[3]; - block1[11] = append2_t[0]; block1[12] = append2_t[1]; block1[13] = append2_t[2]; block1[14] = append2_t[3]; - block1[15] = append3_t[0]; break; @@ -633,12 +570,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 5] = append0_t[1]; block1[ 6] = append0_t[2]; block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; block1[ 9] = append1_t[1]; block1[10] = append1_t[2]; block1[11] = append1_t[3]; - block1[12] = append2_t[0]; block1[13] = append2_t[1]; block1[14] = append2_t[2]; @@ -649,12 +584,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 6] = append0_t[1]; block1[ 7] = append0_t[2]; block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; block1[10] = append1_t[1]; block1[11] = append1_t[2]; block1[12] = append1_t[3]; - block1[13] = append2_t[0]; block1[14] = append2_t[1]; block1[15] = append2_t[2]; @@ -664,12 +597,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 7] = append0_t[1]; block1[ 8] = append0_t[2]; block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; block1[11] = append1_t[1]; block1[12] = append1_t[2]; block1[13] = append1_t[3]; - block1[14] = append2_t[0]; block1[15] = append2_t[1]; break; @@ -678,12 +609,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 8] = append0_t[1]; block1[ 9] = append0_t[2]; block1[10] = append0_t[3]; - block1[11] = append1_t[0]; block1[12] = append1_t[1]; block1[13] = append1_t[2]; block1[14] = append1_t[3]; - block1[15] = append2_t[0]; break; @@ -691,7 +620,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 9] = append0_t[1]; block1[10] = append0_t[2]; block1[11] = append0_t[3]; - block1[12] = append1_t[0]; block1[13] = append1_t[1]; block1[14] = append1_t[2]; @@ -702,7 +630,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[10] = append0_t[1]; block1[11] = append0_t[2]; block1[12] = append0_t[3]; - block1[13] = append1_t[0]; block1[14] = append1_t[1]; block1[15] = append1_t[2]; @@ -712,7 +639,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[11] = append0_t[1]; block1[12] = append0_t[2]; block1[13] = append0_t[3]; - block1[14] = append1_t[0]; block1[15] = append1_t[1]; break; @@ -721,7 +647,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[12] = append0_t[1]; block1[13] = append0_t[2]; block1[14] = append0_t[3]; - block1[15] = append1_t[0]; break; @@ -741,7 +666,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } diff --git a/OpenCL/m13800_a0-optimized.cl b/OpenCL/m13800_a0-optimized.cl index 03e63e556..c44bc74af 100644 --- a/OpenCL/m13800_a0-optimized.cl +++ b/OpenCL/m13800_a0-optimized.cl @@ -146,6 +146,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -168,26 +188,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a1-optimized.cl b/OpenCL/m13800_a1-optimized.cl index 2e557fae5..f6315b732 100644 --- a/OpenCL/m13800_a1-optimized.cl +++ b/OpenCL/m13800_a1-optimized.cl @@ -144,6 +144,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -166,26 +186,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a3-optimized.cl b/OpenCL/m13800_a3-optimized.cl index 85818a31e..618f7a130 100644 --- a/OpenCL/m13800_a3-optimized.cl +++ b/OpenCL/m13800_a3-optimized.cl @@ -143,6 +143,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -165,26 +185,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0;