From 5a1d929628721bc31ef8f91fb5599439c649cfd3 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 28 Mar 2019 12:26:24 +0100 Subject: [PATCH] Fix some missing code changes after hc_bytealign() was changed --- OpenCL/inc_common.cl | 1826 ++++++++++++------------------------ OpenCL/inc_rp_optimized.cl | 501 +++++----- 2 files changed, 816 insertions(+), 1511 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 688dd8646..ae9da3c51 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -1070,45 +1070,49 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c) { u32x r; + const int c_mod_4 = c & 3; + + const int c_minus_4 = 4 - c_mod_4; + #if CUDA_ARCH >= 350 - const int c38 = (c & 3) * 8; + const int c38 = c_minus_4 * 8; #if VECT_SIZE == 1 - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c38)); #endif #if VECT_SIZE >= 2 - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(b.s0), "r"(a.s0), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(b.s1), "r"(a.s1), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c38)); #endif #if VECT_SIZE >= 4 - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(b.s2), "r"(a.s2), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(b.s3), "r"(a.s3), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c38)); #endif #if VECT_SIZE >= 8 - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(b.s4), "r"(a.s4), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(b.s5), "r"(a.s5), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(b.s6), "r"(a.s6), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(b.s7), "r"(a.s7), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c38)); #endif #if VECT_SIZE >= 16 - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(b.s8), "r"(a.s8), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(b.s9), "r"(a.s9), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(b.sa), "r"(a.sa), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(b.sb), "r"(a.sb), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(b.sc), "r"(a.sc), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(b.sd), "r"(a.sd), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(b.se), "r"(a.se), "r"(c38)); - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(b.sf), "r"(a.sf), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(a.s8), "r"(b.s8), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(a.s9), "r"(b.s9), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(a.sa), "r"(b.sa), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(a.sb), "r"(b.sb), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(a.sc), "r"(b.sc), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(a.sd), "r"(b.sd), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(a.se), "r"(b.se), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(a.sf), "r"(b.sf), "r"(c38)); #endif #else - r = hc_byte_perm (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff); + r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff); #endif @@ -1119,15 +1123,19 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c) { u32 r; + const int c_mod_4 = c & 3; + + const int c_minus_4 = 4 - c_mod_4; + #if CUDA_ARCH >= 350 - const int c38 = (c & 3) * 8; + const int c38 = c_minus_4 * 8; - asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(b), "r"(a), "r"(c38)); + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c38)); #else - r = hc_byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff); + r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff); #endif @@ -3206,299 +3214,168 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 #endif #ifdef IS_NV - - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset_mod_4; - - // todo + // atm only same code as for AMD, but could be improved switch (offset_switch) { case 0: - c0[0] = hc_bytealign ( 0, w3[3], offset_minus_4); - w3[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = hc_bytealign (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = 0; - } + c0[0] = hc_bytealign (w3[3], 0, offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); break; case 1: - c0[1] = hc_bytealign ( 0, w3[3], offset_minus_4); - c0[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); - w3[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = hc_bytealign (w0[0], 0, offset_minus_4); + c0[1] = hc_bytealign (w3[3], 0, offset); + c0[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = 0; - } - break; case 2: - c0[2] = hc_bytealign ( 0, w3[3], offset_minus_4); - c0[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c0[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); - w3[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = hc_bytealign (w0[0], 0, offset_minus_4); + c0[2] = hc_bytealign (w3[3], 0, offset); + c0[1] = hc_bytealign (w3[2], w3[3], offset); + c0[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = 0; - } - break; case 3: - c0[3] = hc_bytealign ( 0, w3[3], offset_minus_4); - c0[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c0[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c0[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); - w3[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = hc_bytealign (w0[0], 0, offset_minus_4); + c0[3] = hc_bytealign (w3[3], 0, offset); + c0[2] = hc_bytealign (w3[2], w3[3], offset); + c0[1] = hc_bytealign (w3[1], w3[2], offset); + c0[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = 0; - } - break; case 4: - c1[0] = hc_bytealign ( 0, w3[3], offset_minus_4); - c0[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c0[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c0[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c0[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); - w3[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = hc_bytealign (w0[0], 0, offset_minus_4); + c1[0] = hc_bytealign (w3[3], 0, offset); + c0[3] = hc_bytealign (w3[2], w3[3], offset); + c0[2] = hc_bytealign (w3[1], w3[2], offset); + c0[1] = hc_bytealign (w3[0], w3[1], offset); + c0[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = 0; - } - break; case 5: - c1[1] = hc_bytealign ( 0, w3[3], offset_minus_4); - c1[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c0[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c0[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c0[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c0[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); - w3[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = hc_bytealign (w0[0], 0, offset_minus_4); + c1[1] = hc_bytealign (w3[3], 0, offset); + c1[0] = hc_bytealign (w3[2], w3[3], offset); + c0[3] = hc_bytealign (w3[1], w3[2], offset); + c0[2] = hc_bytealign (w3[0], w3[1], offset); + c0[1] = hc_bytealign (w2[3], w3[0], offset); + c0[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = 0; - } - break; case 6: - c1[2] = hc_bytealign ( 0, w3[3], offset_minus_4); - c1[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c1[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c0[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c0[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c0[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c0[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); - w3[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = hc_bytealign (w0[0], 0, offset_minus_4); + c1[2] = hc_bytealign (w3[3], 0, offset); + c1[1] = hc_bytealign (w3[2], w3[3], offset); + c1[0] = hc_bytealign (w3[1], w3[2], offset); + c0[3] = hc_bytealign (w3[0], w3[1], offset); + c0[2] = hc_bytealign (w2[3], w3[0], offset); + c0[1] = hc_bytealign (w2[2], w2[3], offset); + c0[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3506,47 +3383,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = 0; - } - break; case 7: - c1[3] = hc_bytealign ( 0, w3[3], offset_minus_4); - c1[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c1[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c1[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c0[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c0[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c0[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c0[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); - w3[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = hc_bytealign (w0[0], 0, offset_minus_4); + c1[3] = hc_bytealign (w3[3], 0, offset); + c1[2] = hc_bytealign (w3[2], w3[3], offset); + c1[1] = hc_bytealign (w3[1], w3[2], offset); + c1[0] = hc_bytealign (w3[0], w3[1], offset); + c0[3] = hc_bytealign (w2[3], w3[0], offset); + c0[2] = hc_bytealign (w2[2], w2[3], offset); + c0[1] = hc_bytealign (w2[1], w2[2], offset); + c0[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3555,47 +3411,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = 0; - } - break; case 8: - c2[0] = hc_bytealign ( 0, w3[3], offset_minus_4); - c1[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c1[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c1[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c1[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c0[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c0[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c0[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c0[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); - w3[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = hc_bytealign (w0[0], 0, offset_minus_4); + c2[0] = hc_bytealign (w3[3], 0, offset); + c1[3] = hc_bytealign (w3[2], w3[3], offset); + c1[2] = hc_bytealign (w3[1], w3[2], offset); + c1[1] = hc_bytealign (w3[0], w3[1], offset); + c1[0] = hc_bytealign (w2[3], w3[0], offset); + c0[3] = hc_bytealign (w2[2], w2[3], offset); + c0[2] = hc_bytealign (w2[1], w2[2], offset); + c0[1] = hc_bytealign (w2[0], w2[1], offset); + c0[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3605,47 +3440,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = 0; - } - break; case 9: - c2[1] = hc_bytealign ( 0, w3[3], offset_minus_4); - c2[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c1[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c1[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c1[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c1[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c0[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c0[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c0[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c0[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); - w3[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = hc_bytealign (w0[0], 0, offset_minus_4); + c2[1] = hc_bytealign (w3[3], 0, offset); + c2[0] = hc_bytealign (w3[2], w3[3], offset); + c1[3] = hc_bytealign (w3[1], w3[2], offset); + c1[2] = hc_bytealign (w3[0], w3[1], offset); + c1[1] = hc_bytealign (w2[3], w3[0], offset); + c1[0] = hc_bytealign (w2[2], w2[3], offset); + c0[3] = hc_bytealign (w2[1], w2[2], offset); + c0[2] = hc_bytealign (w2[0], w2[1], offset); + c0[1] = hc_bytealign (w1[3], w2[0], offset); + c0[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3656,47 +3470,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = 0; - } - break; case 10: - c2[2] = hc_bytealign ( 0, w3[3], offset_minus_4); - c2[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c2[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c1[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c1[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c1[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c1[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c0[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c0[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c0[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); - c0[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); - w3[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = hc_bytealign (w0[0], 0, offset_minus_4); + c2[2] = hc_bytealign (w3[3], 0, offset); + c2[1] = hc_bytealign (w3[2], w3[3], offset); + c2[0] = hc_bytealign (w3[1], w3[2], offset); + c1[3] = hc_bytealign (w3[0], w3[1], offset); + c1[2] = hc_bytealign (w2[3], w3[0], offset); + c1[1] = hc_bytealign (w2[2], w2[3], offset); + c1[0] = hc_bytealign (w2[1], w2[2], offset); + c0[3] = hc_bytealign (w2[0], w2[1], offset); + c0[2] = hc_bytealign (w1[3], w2[0], offset); + c0[1] = hc_bytealign (w1[2], w1[3], offset); + c0[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3708,47 +3501,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = 0; - } - break; case 11: - c2[3] = hc_bytealign ( 0, w3[3], offset_minus_4); - c2[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c2[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c2[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c1[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c1[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c1[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c1[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c0[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c0[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); - c0[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); - c0[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); - w3[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = hc_bytealign (w0[0], 0, offset_minus_4); + c2[3] = hc_bytealign (w3[3], 0, offset); + c2[2] = hc_bytealign (w3[2], w3[3], offset); + c2[1] = hc_bytealign (w3[1], w3[2], offset); + c2[0] = hc_bytealign (w3[0], w3[1], offset); + c1[3] = hc_bytealign (w2[3], w3[0], offset); + c1[2] = hc_bytealign (w2[2], w2[3], offset); + c1[1] = hc_bytealign (w2[1], w2[2], offset); + c1[0] = hc_bytealign (w2[0], w2[1], offset); + c0[3] = hc_bytealign (w1[3], w2[0], offset); + c0[2] = hc_bytealign (w1[2], w1[3], offset); + c0[1] = hc_bytealign (w1[1], w1[2], offset); + c0[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3761,47 +3533,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = 0; - } - break; case 12: - c3[0] = hc_bytealign ( 0, w3[3], offset_minus_4); - c2[3] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c2[2] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c2[1] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c2[0] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c1[3] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c1[2] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c1[1] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c1[0] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c0[3] = hc_bytealign (w1[3], w1[2], offset_minus_4); - c0[2] = hc_bytealign (w1[2], w1[1], offset_minus_4); - c0[1] = hc_bytealign (w1[1], w1[0], offset_minus_4); - c0[0] = hc_bytealign (w1[0], w0[3], offset_minus_4); - w3[3] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = hc_bytealign (w0[0], 0, offset_minus_4); + c3[0] = hc_bytealign (w3[3], 0, offset); + c2[3] = hc_bytealign (w3[2], w3[3], offset); + c2[2] = hc_bytealign (w3[1], w3[2], offset); + c2[1] = hc_bytealign (w3[0], w3[1], offset); + c2[0] = hc_bytealign (w2[3], w3[0], offset); + c1[3] = hc_bytealign (w2[2], w2[3], offset); + c1[2] = hc_bytealign (w2[1], w2[2], offset); + c1[1] = hc_bytealign (w2[0], w2[1], offset); + c1[0] = hc_bytealign (w1[3], w2[0], offset); + c0[3] = hc_bytealign (w1[2], w1[3], offset); + c0[2] = hc_bytealign (w1[1], w1[2], offset); + c0[1] = hc_bytealign (w1[0], w1[1], offset); + c0[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -3815,47 +3566,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = 0; - } - break; case 13: - c3[1] = hc_bytealign ( 0, w3[3], offset_minus_4); - c3[0] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c2[3] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c2[2] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c2[1] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c2[0] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c1[3] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c1[2] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c1[1] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c1[0] = hc_bytealign (w1[3], w1[2], offset_minus_4); - c0[3] = hc_bytealign (w1[2], w1[1], offset_minus_4); - c0[2] = hc_bytealign (w1[1], w1[0], offset_minus_4); - c0[1] = hc_bytealign (w1[0], w0[3], offset_minus_4); - c0[0] = hc_bytealign (w0[3], w0[2], offset_minus_4); - w3[3] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = hc_bytealign (w0[0], 0, offset_minus_4); + c3[1] = hc_bytealign (w3[3], 0, offset); + c3[0] = hc_bytealign (w3[2], w3[3], offset); + c2[3] = hc_bytealign (w3[1], w3[2], offset); + c2[2] = hc_bytealign (w3[0], w3[1], offset); + c2[1] = hc_bytealign (w2[3], w3[0], offset); + c2[0] = hc_bytealign (w2[2], w2[3], offset); + c1[3] = hc_bytealign (w2[1], w2[2], offset); + c1[2] = hc_bytealign (w2[0], w2[1], offset); + c1[1] = hc_bytealign (w1[3], w2[0], offset); + c1[0] = hc_bytealign (w1[2], w1[3], offset); + c0[3] = hc_bytealign (w1[1], w1[2], offset); + c0[2] = hc_bytealign (w1[0], w1[1], offset); + c0[1] = hc_bytealign (w0[3], w1[0], offset); + c0[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -3870,47 +3600,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = 0; - } - break; case 14: - c3[2] = hc_bytealign ( 0, w3[3], offset_minus_4); - c3[1] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c3[0] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c2[3] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c2[2] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c2[1] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c2[0] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c1[3] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c1[2] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c1[1] = hc_bytealign (w1[3], w1[2], offset_minus_4); - c1[0] = hc_bytealign (w1[2], w1[1], offset_minus_4); - c0[3] = hc_bytealign (w1[1], w1[0], offset_minus_4); - c0[2] = hc_bytealign (w1[0], w0[3], offset_minus_4); - c0[1] = hc_bytealign (w0[3], w0[2], offset_minus_4); - c0[0] = hc_bytealign (w0[2], w0[1], offset_minus_4); - w3[3] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = hc_bytealign (w0[0], 0, offset_minus_4); + c3[2] = hc_bytealign (w3[3], 0, offset); + c3[1] = hc_bytealign (w3[2], w3[3], offset); + c3[0] = hc_bytealign (w3[1], w3[2], offset); + c2[3] = hc_bytealign (w3[0], w3[1], offset); + c2[2] = hc_bytealign (w2[3], w3[0], offset); + c2[1] = hc_bytealign (w2[2], w2[3], offset); + c2[0] = hc_bytealign (w2[1], w2[2], offset); + c1[3] = hc_bytealign (w2[0], w2[1], offset); + c1[2] = hc_bytealign (w1[3], w2[0], offset); + c1[1] = hc_bytealign (w1[2], w1[3], offset); + c1[0] = hc_bytealign (w1[1], w1[2], offset); + c0[3] = hc_bytealign (w1[0], w1[1], offset); + c0[2] = hc_bytealign (w0[3], w1[0], offset); + c0[1] = hc_bytealign (w0[2], w0[3], offset); + c0[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -3926,47 +3635,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = c3[2]; - c3[2] = 0; - } - break; case 15: - c3[3] = hc_bytealign ( 0, w3[3], offset_minus_4); - c3[2] = hc_bytealign (w3[3], w3[2], offset_minus_4); - c3[1] = hc_bytealign (w3[2], w3[1], offset_minus_4); - c3[0] = hc_bytealign (w3[1], w3[0], offset_minus_4); - c2[3] = hc_bytealign (w3[0], w2[3], offset_minus_4); - c2[2] = hc_bytealign (w2[3], w2[2], offset_minus_4); - c2[1] = hc_bytealign (w2[2], w2[1], offset_minus_4); - c2[0] = hc_bytealign (w2[1], w2[0], offset_minus_4); - c1[3] = hc_bytealign (w2[0], w1[3], offset_minus_4); - c1[2] = hc_bytealign (w1[3], w1[2], offset_minus_4); - c1[1] = hc_bytealign (w1[2], w1[1], offset_minus_4); - c1[0] = hc_bytealign (w1[1], w1[0], offset_minus_4); - c0[3] = hc_bytealign (w1[0], w0[3], offset_minus_4); - c0[2] = hc_bytealign (w0[3], w0[2], offset_minus_4); - c0[1] = hc_bytealign (w0[2], w0[1], offset_minus_4); - c0[0] = hc_bytealign (w0[1], w0[0], offset_minus_4); - w3[3] = hc_bytealign (w0[0], 0, offset_minus_4); + c3[3] = hc_bytealign (w3[3], 0, offset); + c3[2] = hc_bytealign (w3[2], w3[3], offset); + c3[1] = hc_bytealign (w3[1], w3[2], offset); + c3[0] = hc_bytealign (w3[0], w3[1], offset); + c2[3] = hc_bytealign (w2[3], w3[0], offset); + c2[2] = hc_bytealign (w2[2], w2[3], offset); + c2[1] = hc_bytealign (w2[1], w2[2], offset); + c2[0] = hc_bytealign (w2[0], w2[1], offset); + c1[3] = hc_bytealign (w1[3], w2[0], offset); + c1[2] = hc_bytealign (w1[2], w1[3], offset); + c1[1] = hc_bytealign (w1[1], w1[2], offset); + c1[0] = hc_bytealign (w1[0], w1[1], offset); + c0[3] = hc_bytealign (w0[3], w1[0], offset); + c0[2] = hc_bytealign (w0[2], w0[3], offset); + c0[1] = hc_bytealign (w0[1], w0[2], offset); + c0[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -3983,27 +3671,6 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = c3[2]; - c3[2] = c3[3]; - c3[3] = 0; - } - break; } #endif @@ -33750,299 +33417,168 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 #endif #ifdef IS_NV - - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset_mod_4; - - // todo + // could be improved, too switch (offset_switch) { case 0: - c0[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - w3[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = 0; - } + c0[0] = hc_bytealign_S (w3[3], 0, offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); break; case 1: - c0[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c0[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c0[1] = hc_bytealign_S (w3[3], 0, offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = 0; - } - break; case 2: - c0[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c0[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c0[2] = hc_bytealign_S (w3[3], 0, offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = 0; - } - break; case 3: - c0[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c0[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c0[3] = hc_bytealign_S (w3[3], 0, offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = 0; - } - break; case 4: - c1[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c0[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c1[0] = hc_bytealign_S (w3[3], 0, offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = 0; - } - break; case 5: - c1[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c1[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c0[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c1[1] = hc_bytealign_S (w3[3], 0, offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = 0; - } - break; case 6: - c1[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c1[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c0[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c1[2] = hc_bytealign_S (w3[3], 0, offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -34050,47 +33586,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = 0; - } - break; case 7: - c1[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c1[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c0[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c1[3] = hc_bytealign_S (w3[3], 0, offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -34099,47 +33614,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = 0; - } - break; case 8: - c2[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c1[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c0[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c2[0] = hc_bytealign_S (w3[3], 0, offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -34149,47 +33643,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = 0; - } - break; case 9: - c2[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c2[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c1[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c0[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c2[1] = hc_bytealign_S (w3[3], 0, offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -34200,47 +33673,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = 0; - } - break; case 10: - c2[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c2[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c1[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c0[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c2[2] = hc_bytealign_S (w3[3], 0, offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -34252,47 +33704,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = 0; - } - break; case 11: - c2[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c2[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c1[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c0[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c2[3] = hc_bytealign_S (w3[3], 0, offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -34305,47 +33736,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = 0; - } - break; case 12: - c3[0] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c2[3] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[2] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[1] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[0] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c1[3] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[2] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[1] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[0] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c0[3] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[2] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[1] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[0] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c3[0] = hc_bytealign_S (w3[3], 0, offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -34359,47 +33769,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = 0; - } - break; case 13: - c3[1] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c3[0] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c2[3] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[2] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[1] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[0] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c1[3] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[2] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[1] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[0] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - c0[3] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[2] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[1] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[0] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c3[1] = hc_bytealign_S (w3[3], 0, offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -34414,47 +33803,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = 0; - } - break; case 14: - c3[2] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c3[1] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c3[0] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c2[3] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[2] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[1] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c2[0] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c1[3] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[2] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[1] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - c1[0] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - c0[3] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[2] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[1] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - c0[0] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c3[2] = hc_bytealign_S (w3[3], 0, offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -34470,47 +33838,26 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = c3[2]; - c3[2] = 0; - } - break; case 15: - c3[3] = hc_bytealign_S ( 0, w3[3], offset_minus_4); - c3[2] = hc_bytealign_S (w3[3], w3[2], offset_minus_4); - c3[1] = hc_bytealign_S (w3[2], w3[1], offset_minus_4); - c3[0] = hc_bytealign_S (w3[1], w3[0], offset_minus_4); - c2[3] = hc_bytealign_S (w3[0], w2[3], offset_minus_4); - c2[2] = hc_bytealign_S (w2[3], w2[2], offset_minus_4); - c2[1] = hc_bytealign_S (w2[2], w2[1], offset_minus_4); - c2[0] = hc_bytealign_S (w2[1], w2[0], offset_minus_4); - c1[3] = hc_bytealign_S (w2[0], w1[3], offset_minus_4); - c1[2] = hc_bytealign_S (w1[3], w1[2], offset_minus_4); - c1[1] = hc_bytealign_S (w1[2], w1[1], offset_minus_4); - c1[0] = hc_bytealign_S (w1[1], w1[0], offset_minus_4); - c0[3] = hc_bytealign_S (w1[0], w0[3], offset_minus_4); - c0[2] = hc_bytealign_S (w0[3], w0[2], offset_minus_4); - c0[1] = hc_bytealign_S (w0[2], w0[1], offset_minus_4); - c0[0] = hc_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = hc_bytealign_S (w0[0], 0, offset_minus_4); + c3[3] = hc_bytealign_S (w3[3], 0, offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -34527,27 +33874,6 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = c0[0]; - c0[0] = c0[1]; - c0[1] = c0[2]; - c0[2] = c0[3]; - c0[3] = c1[0]; - c1[0] = c1[1]; - c1[1] = c1[2]; - c1[2] = c1[3]; - c1[3] = c2[0]; - c2[0] = c2[1]; - c2[1] = c2[2]; - c2[2] = c2[3]; - c2[3] = c3[0]; - c3[0] = c3[1]; - c3[1] = c3[2]; - c3[2] = c3[3]; - c3[3] = 0; - } - break; } #endif diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index e8095a44e..7d41e6d8d 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -27,11 +27,7 @@ DECLSPEC void truncate_right_optimized (u32 *buf0, u32 *buf1, const u32 offset) { const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u; - #ifdef IS_AMD const int offset_switch = offset / 4; - #else - const int offset_switch = offset / 4; - #endif switch (offset_switch) { @@ -86,11 +82,7 @@ DECLSPEC void truncate_left_optimized (u32 *buf0, u32 *buf1, const u32 offset) { const u32 tmp = ~((1u << ((offset & 3u) * 8u)) - 1u); - #ifdef IS_AMD const int offset_switch = offset / 4; - #else - const int offset_switch = offset / 4; - #endif switch (offset_switch) { @@ -143,26 +135,26 @@ DECLSPEC void truncate_left_optimized (u32 *buf0, u32 *buf1, const u32 offset) DECLSPEC void lshift_block_optimized (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1) { - out0[0] = hc_bytealign_S (in0[1], in0[0], 1); - out0[1] = hc_bytealign_S (in0[2], in0[1], 1); - out0[2] = hc_bytealign_S (in0[3], in0[2], 1); - out0[3] = hc_bytealign_S (in1[0], in0[3], 1); - out1[0] = hc_bytealign_S (in1[1], in1[0], 1); - out1[1] = hc_bytealign_S (in1[2], in1[1], 1); - out1[2] = hc_bytealign_S (in1[3], in1[2], 1); - out1[3] = hc_bytealign_S ( 0, in1[3], 1); + out0[0] = hc_bytealign_S (in0[0], in0[1], 3); + out0[1] = hc_bytealign_S (in0[1], in0[2], 3); + out0[2] = hc_bytealign_S (in0[2], in0[3], 3); + out0[3] = hc_bytealign_S (in0[3], in1[0], 3); + out1[0] = hc_bytealign_S (in1[0], in1[1], 3); + out1[1] = hc_bytealign_S (in1[1], in1[2], 3); + out1[2] = hc_bytealign_S (in1[2], in1[3], 3); + out1[3] = hc_bytealign_S (in1[3], 0, 3); } DECLSPEC void rshift_block_optimized (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1) { - out1[3] = hc_bytealign_S (in1[3], in1[2], 3); - out1[2] = hc_bytealign_S (in1[2], in1[1], 3); - out1[1] = hc_bytealign_S (in1[1], in1[0], 3); - out1[0] = hc_bytealign_S (in1[0], in0[3], 3); - out0[3] = hc_bytealign_S (in0[3], in0[2], 3); - out0[2] = hc_bytealign_S (in0[2], in0[1], 3); - out0[1] = hc_bytealign_S (in0[1], in0[0], 3); - out0[0] = hc_bytealign_S (in0[0], 0, 3); + out1[3] = hc_bytealign_S (in1[2], in1[3], 1); + out1[2] = hc_bytealign_S (in1[1], in1[2], 1); + out1[1] = hc_bytealign_S (in1[0], in1[1], 1); + out1[0] = hc_bytealign_S (in0[3], in1[0], 1); + out0[3] = hc_bytealign_S (in0[2], in0[3], 1); + out0[2] = hc_bytealign_S (in0[1], in0[2], 1); + out0[1] = hc_bytealign_S (in0[0], in0[1], 1); + out0[0] = hc_bytealign_S ( 0, in0[0], 1); } DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const u32 num) @@ -178,32 +170,32 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = in1[2]; out1[3] = in1[3]; break; - case 1: out0[0] = hc_bytealign_S (in0[1], in0[0], 1); - out0[1] = hc_bytealign_S (in0[2], in0[1], 1); - out0[2] = hc_bytealign_S (in0[3], in0[2], 1); - out0[3] = hc_bytealign_S (in1[0], in0[3], 1); - out1[0] = hc_bytealign_S (in1[1], in1[0], 1); - out1[1] = hc_bytealign_S (in1[2], in1[1], 1); - out1[2] = hc_bytealign_S (in1[3], in1[2], 1); - out1[3] = hc_bytealign_S ( 0, in1[3], 1); + case 1: out0[0] = hc_bytealign_S (in0[0], in0[1], 3); + out0[1] = hc_bytealign_S (in0[1], in0[2], 3); + out0[2] = hc_bytealign_S (in0[2], in0[3], 3); + out0[3] = hc_bytealign_S (in0[3], in1[0], 3); + out1[0] = hc_bytealign_S (in1[0], in1[1], 3); + out1[1] = hc_bytealign_S (in1[1], in1[2], 3); + out1[2] = hc_bytealign_S (in1[2], in1[3], 3); + out1[3] = hc_bytealign_S (in1[3], 0, 3); break; - case 2: out0[0] = hc_bytealign_S (in0[1], in0[0], 2); - out0[1] = hc_bytealign_S (in0[2], in0[1], 2); - out0[2] = hc_bytealign_S (in0[3], in0[2], 2); - out0[3] = hc_bytealign_S (in1[0], in0[3], 2); - out1[0] = hc_bytealign_S (in1[1], in1[0], 2); - out1[1] = hc_bytealign_S (in1[2], in1[1], 2); - out1[2] = hc_bytealign_S (in1[3], in1[2], 2); - out1[3] = hc_bytealign_S ( 0, in1[3], 2); + case 2: out0[0] = hc_bytealign_S (in0[0], in0[1], 2); + out0[1] = hc_bytealign_S (in0[1], in0[2], 2); + out0[2] = hc_bytealign_S (in0[2], in0[3], 2); + out0[3] = hc_bytealign_S (in0[3], in1[0], 2); + out1[0] = hc_bytealign_S (in1[0], in1[1], 2); + out1[1] = hc_bytealign_S (in1[1], in1[2], 2); + out1[2] = hc_bytealign_S (in1[2], in1[3], 2); + out1[3] = hc_bytealign_S (in1[3], 0, 2); break; - case 3: out0[0] = hc_bytealign_S (in0[1], in0[0], 3); - out0[1] = hc_bytealign_S (in0[2], in0[1], 3); - out0[2] = hc_bytealign_S (in0[3], in0[2], 3); - out0[3] = hc_bytealign_S (in1[0], in0[3], 3); - out1[0] = hc_bytealign_S (in1[1], in1[0], 3); - out1[1] = hc_bytealign_S (in1[2], in1[1], 3); - out1[2] = hc_bytealign_S (in1[3], in1[2], 3); - out1[3] = hc_bytealign_S ( 0, in1[3], 3); + case 3: out0[0] = hc_bytealign_S (in0[0], in0[1], 1); + out0[1] = hc_bytealign_S (in0[1], in0[2], 1); + out0[2] = hc_bytealign_S (in0[2], in0[3], 1); + out0[3] = hc_bytealign_S (in0[3], in1[0], 1); + out1[0] = hc_bytealign_S (in1[0], in1[1], 1); + out1[1] = hc_bytealign_S (in1[1], in1[2], 1); + out1[2] = hc_bytealign_S (in1[2], in1[3], 1); + out1[3] = hc_bytealign_S (in1[3], 0, 1); break; case 4: out0[0] = in0[1]; out0[1] = in0[2]; @@ -214,31 +206,31 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = in1[3]; out1[3] = 0; break; - case 5: out0[0] = hc_bytealign_S (in0[2], in0[1], 1); - out0[1] = hc_bytealign_S (in0[3], in0[2], 1); - out0[2] = hc_bytealign_S (in1[0], in0[3], 1); - out0[3] = hc_bytealign_S (in1[1], in1[0], 1); - out1[0] = hc_bytealign_S (in1[2], in1[1], 1); - out1[1] = hc_bytealign_S (in1[3], in1[2], 1); - out1[2] = hc_bytealign_S ( 0, in1[3], 1); + case 5: out0[0] = hc_bytealign_S (in0[1], in0[2], 3); + out0[1] = hc_bytealign_S (in0[2], in0[3], 3); + out0[2] = hc_bytealign_S (in0[3], in1[0], 3); + out0[3] = hc_bytealign_S (in1[0], in1[1], 3); + out1[0] = hc_bytealign_S (in1[1], in1[2], 3); + out1[1] = hc_bytealign_S (in1[2], in1[3], 3); + out1[2] = hc_bytealign_S (in1[3], 0, 3); out1[3] = 0; break; - case 6: out0[0] = hc_bytealign_S (in0[2], in0[1], 2); - out0[1] = hc_bytealign_S (in0[3], in0[2], 2); - out0[2] = hc_bytealign_S (in1[0], in0[3], 2); - out0[3] = hc_bytealign_S (in1[1], in1[0], 2); - out1[0] = hc_bytealign_S (in1[2], in1[1], 2); - out1[1] = hc_bytealign_S (in1[3], in1[2], 2); - out1[2] = hc_bytealign_S ( 0, in1[3], 2); + case 6: out0[0] = hc_bytealign_S (in0[1], in0[2], 2); + out0[1] = hc_bytealign_S (in0[2], in0[3], 2); + out0[2] = hc_bytealign_S (in0[3], in1[0], 2); + out0[3] = hc_bytealign_S (in1[0], in1[1], 2); + out1[0] = hc_bytealign_S (in1[1], in1[2], 2); + out1[1] = hc_bytealign_S (in1[2], in1[3], 2); + out1[2] = hc_bytealign_S (in1[3], 0, 2); out1[3] = 0; break; - case 7: out0[0] = hc_bytealign_S (in0[2], in0[1], 3); - out0[1] = hc_bytealign_S (in0[3], in0[2], 3); - out0[2] = hc_bytealign_S (in1[0], in0[3], 3); - out0[3] = hc_bytealign_S (in1[1], in1[0], 3); - out1[0] = hc_bytealign_S (in1[2], in1[1], 3); - out1[1] = hc_bytealign_S (in1[3], in1[2], 3); - out1[2] = hc_bytealign_S ( 0, in1[3], 3); + case 7: out0[0] = hc_bytealign_S (in0[1], in0[2], 1); + out0[1] = hc_bytealign_S (in0[2], in0[3], 1); + out0[2] = hc_bytealign_S (in0[3], in1[0], 1); + out0[3] = hc_bytealign_S (in1[0], in1[1], 1); + out1[0] = hc_bytealign_S (in1[1], in1[2], 1); + out1[1] = hc_bytealign_S (in1[2], in1[3], 1); + out1[2] = hc_bytealign_S (in1[3], 0, 1); out1[3] = 0; break; case 8: out0[0] = in0[2]; @@ -250,30 +242,30 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 9: out0[0] = hc_bytealign_S (in0[3], in0[2], 1); - out0[1] = hc_bytealign_S (in1[0], in0[3], 1); - out0[2] = hc_bytealign_S (in1[1], in1[0], 1); - out0[3] = hc_bytealign_S (in1[2], in1[1], 1); - out1[0] = hc_bytealign_S (in1[3], in1[2], 1); - out1[1] = hc_bytealign_S ( 0, in1[3], 1); + case 9: out0[0] = hc_bytealign_S (in0[2], in0[3], 3); + out0[1] = hc_bytealign_S (in0[3], in1[0], 3); + out0[2] = hc_bytealign_S (in1[0], in1[1], 3); + out0[3] = hc_bytealign_S (in1[1], in1[2], 3); + out1[0] = hc_bytealign_S (in1[2], in1[3], 3); + out1[1] = hc_bytealign_S (in1[3], 0, 3); out1[2] = 0; out1[3] = 0; break; - case 10: out0[0] = hc_bytealign_S (in0[3], in0[2], 2); - out0[1] = hc_bytealign_S (in1[0], in0[3], 2); - out0[2] = hc_bytealign_S (in1[1], in1[0], 2); - out0[3] = hc_bytealign_S (in1[2], in1[1], 2); - out1[0] = hc_bytealign_S (in1[3], in1[2], 2); - out1[1] = hc_bytealign_S ( 0, in1[3], 2); + case 10: out0[0] = hc_bytealign_S (in0[2], in0[3], 2); + out0[1] = hc_bytealign_S (in0[3], in1[0], 2); + out0[2] = hc_bytealign_S (in1[0], in1[1], 2); + out0[3] = hc_bytealign_S (in1[1], in1[2], 2); + out1[0] = hc_bytealign_S (in1[2], in1[3], 2); + out1[1] = hc_bytealign_S (in1[3], 0, 2); out1[2] = 0; out1[3] = 0; break; - case 11: out0[0] = hc_bytealign_S (in0[3], in0[2], 3); - out0[1] = hc_bytealign_S (in1[0], in0[3], 3); - out0[2] = hc_bytealign_S (in1[1], in1[0], 3); - out0[3] = hc_bytealign_S (in1[2], in1[1], 3); - out1[0] = hc_bytealign_S (in1[3], in1[2], 3); - out1[1] = hc_bytealign_S ( 0, in1[3], 3); + case 11: out0[0] = hc_bytealign_S (in0[2], in0[3], 1); + out0[1] = hc_bytealign_S (in0[3], in1[0], 1); + out0[2] = hc_bytealign_S (in1[0], in1[1], 1); + out0[3] = hc_bytealign_S (in1[1], in1[2], 1); + out1[0] = hc_bytealign_S (in1[2], in1[3], 1); + out1[1] = hc_bytealign_S (in1[3], 0, 1); out1[2] = 0; out1[3] = 0; break; @@ -286,29 +278,29 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 13: out0[0] = hc_bytealign_S (in1[0], in0[3], 1); - out0[1] = hc_bytealign_S (in1[1], in1[0], 1); - out0[2] = hc_bytealign_S (in1[2], in1[1], 1); - out0[3] = hc_bytealign_S (in1[3], in1[2], 1); - out1[0] = hc_bytealign_S ( 0, in1[3], 1); + case 13: out0[0] = hc_bytealign_S (in0[3], in1[0], 3); + out0[1] = hc_bytealign_S (in1[0], in1[1], 3); + out0[2] = hc_bytealign_S (in1[1], in1[2], 3); + out0[3] = hc_bytealign_S (in1[2], in1[3], 3); + out1[0] = hc_bytealign_S (in1[3], 0, 3); out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 14: out0[0] = hc_bytealign_S (in1[0], in0[3], 2); - out0[1] = hc_bytealign_S (in1[1], in1[0], 2); - out0[2] = hc_bytealign_S (in1[2], in1[1], 2); - out0[3] = hc_bytealign_S (in1[3], in1[2], 2); - out1[0] = hc_bytealign_S ( 0, in1[3], 2); + case 14: out0[0] = hc_bytealign_S (in0[3], in1[0], 2); + out0[1] = hc_bytealign_S (in1[0], in1[1], 2); + out0[2] = hc_bytealign_S (in1[1], in1[2], 2); + out0[3] = hc_bytealign_S (in1[2], in1[3], 2); + out1[0] = hc_bytealign_S (in1[3], 0, 2); out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 15: out0[0] = hc_bytealign_S (in1[0], in0[3], 3); - out0[1] = hc_bytealign_S (in1[1], in1[0], 3); - out0[2] = hc_bytealign_S (in1[2], in1[1], 3); - out0[3] = hc_bytealign_S (in1[3], in1[2], 3); - out1[0] = hc_bytealign_S ( 0, in1[3], 3); + case 15: out0[0] = hc_bytealign_S (in0[3], in1[0], 1); + out0[1] = hc_bytealign_S (in1[0], in1[1], 1); + out0[2] = hc_bytealign_S (in1[1], in1[2], 1); + out0[3] = hc_bytealign_S (in1[2], in1[3], 1); + out1[0] = hc_bytealign_S (in1[3], 0, 1); out1[1] = 0; out1[2] = 0; out1[3] = 0; @@ -322,28 +314,28 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 17: out0[0] = hc_bytealign_S (in1[1], in1[0], 1); - out0[1] = hc_bytealign_S (in1[2], in1[1], 1); - out0[2] = hc_bytealign_S (in1[3], in1[2], 1); - out0[3] = hc_bytealign_S ( 0, in1[3], 1); + case 17: out0[0] = hc_bytealign_S (in1[0], in1[1], 3); + out0[1] = hc_bytealign_S (in1[1], in1[2], 3); + out0[2] = hc_bytealign_S (in1[2], in1[3], 3); + out0[3] = hc_bytealign_S (in1[3], 0, 3); out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 18: out0[0] = hc_bytealign_S (in1[1], in1[0], 2); - out0[1] = hc_bytealign_S (in1[2], in1[1], 2); - out0[2] = hc_bytealign_S (in1[3], in1[2], 2); - out0[3] = hc_bytealign_S ( 0, in1[3], 2); + case 18: out0[0] = hc_bytealign_S (in1[0], in1[1], 2); + out0[1] = hc_bytealign_S (in1[1], in1[2], 2); + out0[2] = hc_bytealign_S (in1[2], in1[3], 2); + out0[3] = hc_bytealign_S (in1[3], 0, 2); out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 19: out0[0] = hc_bytealign_S (in1[1], in1[0], 3); - out0[1] = hc_bytealign_S (in1[2], in1[1], 3); - out0[2] = hc_bytealign_S (in1[3], in1[2], 3); - out0[3] = hc_bytealign_S ( 0, in1[3], 3); + case 19: out0[0] = hc_bytealign_S (in1[0], in1[1], 1); + out0[1] = hc_bytealign_S (in1[1], in1[2], 1); + out0[2] = hc_bytealign_S (in1[2], in1[3], 1); + out0[3] = hc_bytealign_S (in1[3], 0, 1); out1[0] = 0; out1[1] = 0; out1[2] = 0; @@ -358,27 +350,27 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 21: out0[0] = hc_bytealign_S (in1[2], in1[1], 1); - out0[1] = hc_bytealign_S (in1[3], in1[2], 1); - out0[2] = hc_bytealign_S ( 0, in1[3], 1); + case 21: out0[0] = hc_bytealign_S (in1[1], in1[2], 3); + out0[1] = hc_bytealign_S (in1[2], in1[3], 3); + out0[2] = hc_bytealign_S (in1[3], 0, 3); out0[3] = 0; out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 22: out0[0] = hc_bytealign_S (in1[2], in1[1], 2); - out0[1] = hc_bytealign_S (in1[3], in1[2], 2); - out0[2] = hc_bytealign_S ( 0, in1[3], 2); + case 22: out0[0] = hc_bytealign_S (in1[1], in1[2], 2); + out0[1] = hc_bytealign_S (in1[2], in1[3], 2); + out0[2] = hc_bytealign_S (in1[3], 0, 2); out0[3] = 0; out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 23: out0[0] = hc_bytealign_S (in1[2], in1[1], 3); - out0[1] = hc_bytealign_S (in1[3], in1[2], 3); - out0[2] = hc_bytealign_S ( 0, in1[3], 3); + case 23: out0[0] = hc_bytealign_S (in1[1], in1[2], 1); + out0[1] = hc_bytealign_S (in1[2], in1[3], 1); + out0[2] = hc_bytealign_S (in1[3], 0, 1); out0[3] = 0; out1[0] = 0; out1[1] = 0; @@ -394,8 +386,8 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 25: out0[0] = hc_bytealign_S (in1[3], in1[2], 1); - out0[1] = hc_bytealign_S ( 0, in1[3], 1); + case 25: out0[0] = hc_bytealign_S (in1[2], in1[3], 3); + out0[1] = hc_bytealign_S (in1[3], 0, 3); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -403,8 +395,8 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 26: out0[0] = hc_bytealign_S (in1[3], in1[2], 2); - out0[1] = hc_bytealign_S ( 0, in1[3], 2); + case 26: out0[0] = hc_bytealign_S (in1[2], in1[3], 2); + out0[1] = hc_bytealign_S (in1[3], 0, 2); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -412,8 +404,8 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 27: out0[0] = hc_bytealign_S (in1[3], in1[2], 3); - out0[1] = hc_bytealign_S ( 0, in1[3], 3); + case 27: out0[0] = hc_bytealign_S (in1[2], in1[3], 1); + out0[1] = hc_bytealign_S (in1[3], 0, 1); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -430,7 +422,7 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 29: out0[0] = hc_bytealign_S ( 0, in1[3], 1); + case 29: out0[0] = hc_bytealign_S (in1[3], 0, 3); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -439,7 +431,7 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 30: out0[0] = hc_bytealign_S ( 0, in1[3], 2); + case 30: out0[0] = hc_bytealign_S (in1[3], 0, 2); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -448,7 +440,7 @@ DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out1[2] = 0; out1[3] = 0; break; - case 31: out0[0] = hc_bytealign_S ( 0, in1[3], 3); + case 31: out0[0] = hc_bytealign_S (in1[3], 0, 1); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -473,32 +465,32 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = in0[1]; out0[0] = in0[0]; break; - case 1: out1[3] = hc_bytealign_S (in1[3], in1[2], 3); - out1[2] = hc_bytealign_S (in1[2], in1[1], 3); - out1[1] = hc_bytealign_S (in1[1], in1[0], 3); - out1[0] = hc_bytealign_S (in1[0], in0[3], 3); - out0[3] = hc_bytealign_S (in0[3], in0[2], 3); - out0[2] = hc_bytealign_S (in0[2], in0[1], 3); - out0[1] = hc_bytealign_S (in0[1], in0[0], 3); - out0[0] = hc_bytealign_S (in0[0], 0, 3); + case 1: out1[3] = hc_bytealign_S (in1[2], in1[3], 1); + out1[2] = hc_bytealign_S (in1[1], in1[2], 1); + out1[1] = hc_bytealign_S (in1[0], in1[1], 1); + out1[0] = hc_bytealign_S (in0[3], in1[0], 1); + out0[3] = hc_bytealign_S (in0[2], in0[3], 1); + out0[2] = hc_bytealign_S (in0[1], in0[2], 1); + out0[1] = hc_bytealign_S (in0[0], in0[1], 1); + out0[0] = hc_bytealign_S ( 0, in0[0], 1); break; - case 2: out1[3] = hc_bytealign_S (in1[3], in1[2], 2); - out1[2] = hc_bytealign_S (in1[2], in1[1], 2); - out1[1] = hc_bytealign_S (in1[1], in1[0], 2); - out1[0] = hc_bytealign_S (in1[0], in0[3], 2); - out0[3] = hc_bytealign_S (in0[3], in0[2], 2); - out0[2] = hc_bytealign_S (in0[2], in0[1], 2); - out0[1] = hc_bytealign_S (in0[1], in0[0], 2); - out0[0] = hc_bytealign_S (in0[0], 0, 2); + case 2: out1[3] = hc_bytealign_S (in1[2], in1[3], 2); + out1[2] = hc_bytealign_S (in1[1], in1[2], 2); + out1[1] = hc_bytealign_S (in1[0], in1[1], 2); + out1[0] = hc_bytealign_S (in0[3], in1[0], 2); + out0[3] = hc_bytealign_S (in0[2], in0[3], 2); + out0[2] = hc_bytealign_S (in0[1], in0[2], 2); + out0[1] = hc_bytealign_S (in0[0], in0[1], 2); + out0[0] = hc_bytealign_S ( 0, in0[0], 2); break; - case 3: out1[3] = hc_bytealign_S (in1[3], in1[2], 1); - out1[2] = hc_bytealign_S (in1[2], in1[1], 1); - out1[1] = hc_bytealign_S (in1[1], in1[0], 1); - out1[0] = hc_bytealign_S (in1[0], in0[3], 1); - out0[3] = hc_bytealign_S (in0[3], in0[2], 1); - out0[2] = hc_bytealign_S (in0[2], in0[1], 1); - out0[1] = hc_bytealign_S (in0[1], in0[0], 1); - out0[0] = hc_bytealign_S (in0[0], 0, 1); + case 3: out1[3] = hc_bytealign_S (in1[2], in1[3], 3); + out1[2] = hc_bytealign_S (in1[1], in1[2], 3); + out1[1] = hc_bytealign_S (in1[0], in1[1], 3); + out1[0] = hc_bytealign_S (in0[3], in1[0], 3); + out0[3] = hc_bytealign_S (in0[2], in0[3], 3); + out0[2] = hc_bytealign_S (in0[1], in0[2], 3); + out0[1] = hc_bytealign_S (in0[0], in0[1], 3); + out0[0] = hc_bytealign_S ( 0, in0[0], 3); break; case 4: out1[3] = in1[2]; out1[2] = in1[1]; @@ -509,31 +501,31 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = in0[0]; out0[0] = 0; break; - case 5: out1[3] = hc_bytealign_S (in1[2], in1[1], 3); - out1[2] = hc_bytealign_S (in1[1], in1[0], 3); - out1[1] = hc_bytealign_S (in1[0], in0[3], 3); - out1[0] = hc_bytealign_S (in0[3], in0[2], 3); - out0[3] = hc_bytealign_S (in0[2], in0[1], 3); - out0[2] = hc_bytealign_S (in0[1], in0[0], 3); - out0[1] = hc_bytealign_S (in0[0], 0, 3); + case 5: out1[3] = hc_bytealign_S (in1[1], in1[2], 1); + out1[2] = hc_bytealign_S (in1[0], in1[1], 1); + out1[1] = hc_bytealign_S (in0[3], in1[0], 1); + out1[0] = hc_bytealign_S (in0[2], in0[3], 1); + out0[3] = hc_bytealign_S (in0[1], in0[2], 1); + out0[2] = hc_bytealign_S (in0[0], in0[1], 1); + out0[1] = hc_bytealign_S ( 0, in0[0], 1); out0[0] = 0; break; - case 6: out1[3] = hc_bytealign_S (in1[2], in1[1], 2); - out1[2] = hc_bytealign_S (in1[1], in1[0], 2); - out1[1] = hc_bytealign_S (in1[0], in0[3], 2); - out1[0] = hc_bytealign_S (in0[3], in0[2], 2); - out0[3] = hc_bytealign_S (in0[2], in0[1], 2); - out0[2] = hc_bytealign_S (in0[1], in0[0], 2); - out0[1] = hc_bytealign_S (in0[0], 0, 2); + case 6: out1[3] = hc_bytealign_S (in1[1], in1[2], 2); + out1[2] = hc_bytealign_S (in1[0], in1[1], 2); + out1[1] = hc_bytealign_S (in0[3], in1[0], 2); + out1[0] = hc_bytealign_S (in0[2], in0[3], 2); + out0[3] = hc_bytealign_S (in0[1], in0[2], 2); + out0[2] = hc_bytealign_S (in0[0], in0[1], 2); + out0[1] = hc_bytealign_S ( 0, in0[0], 2); out0[0] = 0; break; - case 7: out1[3] = hc_bytealign_S (in1[2], in1[1], 1); - out1[2] = hc_bytealign_S (in1[1], in1[0], 1); - out1[1] = hc_bytealign_S (in1[0], in0[3], 1); - out1[0] = hc_bytealign_S (in0[3], in0[2], 1); - out0[3] = hc_bytealign_S (in0[2], in0[1], 1); - out0[2] = hc_bytealign_S (in0[1], in0[0], 1); - out0[1] = hc_bytealign_S (in0[0], 0, 1); + case 7: out1[3] = hc_bytealign_S (in1[1], in1[2], 3); + out1[2] = hc_bytealign_S (in1[0], in1[1], 3); + out1[1] = hc_bytealign_S (in0[3], in1[0], 3); + out1[0] = hc_bytealign_S (in0[2], in0[3], 3); + out0[3] = hc_bytealign_S (in0[1], in0[2], 3); + out0[2] = hc_bytealign_S (in0[0], in0[1], 3); + out0[1] = hc_bytealign_S ( 0, in0[0], 3); out0[0] = 0; break; case 8: out1[3] = in1[1]; @@ -545,30 +537,30 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 9: out1[3] = hc_bytealign_S (in1[1], in1[0], 3); - out1[2] = hc_bytealign_S (in1[0], in0[3], 3); - out1[1] = hc_bytealign_S (in0[3], in0[2], 3); - out1[0] = hc_bytealign_S (in0[2], in0[1], 3); - out0[3] = hc_bytealign_S (in0[1], in0[0], 3); - out0[2] = hc_bytealign_S (in0[0], 0, 3); + case 9: out1[3] = hc_bytealign_S (in1[0], in1[1], 1); + out1[2] = hc_bytealign_S (in0[3], in1[0], 1); + out1[1] = hc_bytealign_S (in0[2], in0[3], 1); + out1[0] = hc_bytealign_S (in0[1], in0[2], 1); + out0[3] = hc_bytealign_S (in0[0], in0[1], 1); + out0[2] = hc_bytealign_S ( 0, in0[0], 1); out0[1] = 0; out0[0] = 0; break; - case 10: out1[3] = hc_bytealign_S (in1[1], in1[0], 2); - out1[2] = hc_bytealign_S (in1[0], in0[3], 2); - out1[1] = hc_bytealign_S (in0[3], in0[2], 2); - out1[0] = hc_bytealign_S (in0[2], in0[1], 2); - out0[3] = hc_bytealign_S (in0[1], in0[0], 2); - out0[2] = hc_bytealign_S (in0[0], 0, 2); + case 10: out1[3] = hc_bytealign_S (in1[0], in1[1], 2); + out1[2] = hc_bytealign_S (in0[3], in1[0], 2); + out1[1] = hc_bytealign_S (in0[2], in0[3], 2); + out1[0] = hc_bytealign_S (in0[1], in0[2], 2); + out0[3] = hc_bytealign_S (in0[0], in0[1], 2); + out0[2] = hc_bytealign_S ( 0, in0[0], 2); out0[1] = 0; out0[0] = 0; break; - case 11: out1[3] = hc_bytealign_S (in1[1], in1[0], 1); - out1[2] = hc_bytealign_S (in1[0], in0[3], 1); - out1[1] = hc_bytealign_S (in0[3], in0[2], 1); - out1[0] = hc_bytealign_S (in0[2], in0[1], 1); - out0[3] = hc_bytealign_S (in0[1], in0[0], 1); - out0[2] = hc_bytealign_S (in0[0], 0, 1); + case 11: out1[3] = hc_bytealign_S (in1[0], in1[1], 3); + out1[2] = hc_bytealign_S (in0[3], in1[0], 3); + out1[1] = hc_bytealign_S (in0[2], in0[3], 3); + out1[0] = hc_bytealign_S (in0[1], in0[2], 3); + out0[3] = hc_bytealign_S (in0[0], in0[1], 3); + out0[2] = hc_bytealign_S ( 0, in0[0], 3); out0[1] = 0; out0[0] = 0; break; @@ -581,29 +573,29 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 13: out1[3] = hc_bytealign_S (in1[0], in0[3], 3); - out1[2] = hc_bytealign_S (in0[3], in0[2], 3); - out1[1] = hc_bytealign_S (in0[2], in0[1], 3); - out1[0] = hc_bytealign_S (in0[1], in0[0], 3); - out0[3] = hc_bytealign_S (in0[0], 0, 3); + case 13: out1[3] = hc_bytealign_S (in0[3], in1[0], 1); + out1[2] = hc_bytealign_S (in0[2], in0[3], 1); + out1[1] = hc_bytealign_S (in0[1], in0[2], 1); + out1[0] = hc_bytealign_S (in0[0], in0[1], 1); + out0[3] = hc_bytealign_S ( 0, in0[0], 1); out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 14: out1[3] = hc_bytealign_S (in1[0], in0[3], 2); - out1[2] = hc_bytealign_S (in0[3], in0[2], 2); - out1[1] = hc_bytealign_S (in0[2], in0[1], 2); - out1[0] = hc_bytealign_S (in0[1], in0[0], 2); - out0[3] = hc_bytealign_S (in0[0], 0, 2); + case 14: out1[3] = hc_bytealign_S (in0[3], in1[0], 2); + out1[2] = hc_bytealign_S (in0[2], in0[3], 2); + out1[1] = hc_bytealign_S (in0[1], in0[2], 2); + out1[0] = hc_bytealign_S (in0[0], in0[1], 2); + out0[3] = hc_bytealign_S ( 0, in0[0], 2); out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 15: out1[3] = hc_bytealign_S (in1[0], in0[3], 1); - out1[2] = hc_bytealign_S (in0[3], in0[2], 1); - out1[1] = hc_bytealign_S (in0[2], in0[1], 1); - out1[0] = hc_bytealign_S (in0[1], in0[0], 1); - out0[3] = hc_bytealign_S (in0[0], 0, 1); + case 15: out1[3] = hc_bytealign_S (in0[3], in1[0], 3); + out1[2] = hc_bytealign_S (in0[2], in0[3], 3); + out1[1] = hc_bytealign_S (in0[1], in0[2], 3); + out1[0] = hc_bytealign_S (in0[0], in0[1], 3); + out0[3] = hc_bytealign_S ( 0, in0[0], 3); out0[2] = 0; out0[1] = 0; out0[0] = 0; @@ -617,28 +609,28 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 17: out1[3] = hc_bytealign_S (in0[3], in0[2], 3); - out1[2] = hc_bytealign_S (in0[2], in0[1], 3); - out1[1] = hc_bytealign_S (in0[1], in0[0], 3); - out1[0] = hc_bytealign_S (in0[0], 0, 3); + case 17: out1[3] = hc_bytealign_S (in0[2], in0[3], 1); + out1[2] = hc_bytealign_S (in0[1], in0[2], 1); + out1[1] = hc_bytealign_S (in0[0], in0[1], 1); + out1[0] = hc_bytealign_S ( 0, in0[0], 1); out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 18: out1[3] = hc_bytealign_S (in0[3], in0[2], 2); - out1[2] = hc_bytealign_S (in0[2], in0[1], 2); - out1[1] = hc_bytealign_S (in0[1], in0[0], 2); - out1[0] = hc_bytealign_S (in0[0], 0, 2); + case 18: out1[3] = hc_bytealign_S (in0[2], in0[3], 2); + out1[2] = hc_bytealign_S (in0[1], in0[2], 2); + out1[1] = hc_bytealign_S (in0[0], in0[1], 2); + out1[0] = hc_bytealign_S ( 0, in0[0], 2); out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 19: out1[3] = hc_bytealign_S (in0[3], in0[2], 1); - out1[2] = hc_bytealign_S (in0[2], in0[1], 1); - out1[1] = hc_bytealign_S (in0[1], in0[0], 1); - out1[0] = hc_bytealign_S (in0[0], 0, 1); + case 19: out1[3] = hc_bytealign_S (in0[2], in0[3], 3); + out1[2] = hc_bytealign_S (in0[1], in0[2], 3); + out1[1] = hc_bytealign_S (in0[0], in0[1], 3); + out1[0] = hc_bytealign_S ( 0, in0[0], 3); out0[3] = 0; out0[2] = 0; out0[1] = 0; @@ -653,27 +645,27 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 21: out1[3] = hc_bytealign_S (in0[2], in0[1], 3); - out1[2] = hc_bytealign_S (in0[1], in0[0], 3); - out1[1] = hc_bytealign_S (in0[0], 0, 3); + case 21: out1[3] = hc_bytealign_S (in0[1], in0[2], 1); + out1[2] = hc_bytealign_S (in0[0], in0[1], 1); + out1[1] = hc_bytealign_S ( 0, in0[0], 1); out1[0] = 0; out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 22: out1[3] = hc_bytealign_S (in0[2], in0[1], 2); - out1[2] = hc_bytealign_S (in0[1], in0[0], 2); - out1[1] = hc_bytealign_S (in0[0], 0, 2); + case 22: out1[3] = hc_bytealign_S (in0[1], in0[2], 2); + out1[2] = hc_bytealign_S (in0[0], in0[1], 2); + out1[1] = hc_bytealign_S ( 0, in0[0], 2); out1[0] = 0; out0[3] = 0; out0[2] = 0; out0[1] = 0; out0[0] = 0; break; - case 23: out1[3] = hc_bytealign_S (in0[2], in0[1], 1); - out1[2] = hc_bytealign_S (in0[1], in0[0], 1); - out1[1] = hc_bytealign_S (in0[0], 0, 1); + case 23: out1[3] = hc_bytealign_S (in0[1], in0[2], 3); + out1[2] = hc_bytealign_S (in0[0], in0[1], 3); + out1[1] = hc_bytealign_S ( 0, in0[0], 3); out1[0] = 0; out0[3] = 0; out0[2] = 0; @@ -689,8 +681,8 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 25: out1[3] = hc_bytealign_S (in0[1], in0[0], 3); - out1[2] = hc_bytealign_S (in0[0], 0, 3); + case 25: out1[3] = hc_bytealign_S (in0[0], in0[1], 1); + out1[2] = hc_bytealign_S ( 0, in0[0], 1); out1[1] = 0; out1[0] = 0; out0[3] = 0; @@ -698,8 +690,8 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 26: out1[3] = hc_bytealign_S (in0[1], in0[0], 2); - out1[2] = hc_bytealign_S (in0[0], 0, 2); + case 26: out1[3] = hc_bytealign_S (in0[0], in0[1], 2); + out1[2] = hc_bytealign_S ( 0, in0[0], 2); out1[1] = 0; out1[0] = 0; out0[3] = 0; @@ -707,8 +699,8 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 27: out1[3] = hc_bytealign_S (in0[1], in0[0], 1); - out1[2] = hc_bytealign_S (in0[0], 0, 1); + case 27: out1[3] = hc_bytealign_S (in0[0], in0[1], 3); + out1[2] = hc_bytealign_S ( 0, in0[0], 3); out1[1] = 0; out1[0] = 0; out0[3] = 0; @@ -725,7 +717,7 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 29: out1[3] = hc_bytealign_S (in0[0], 0, 3); + case 29: out1[3] = hc_bytealign_S ( 0, in0[0], 1); out1[2] = 0; out1[1] = 0; out1[0] = 0; @@ -734,7 +726,7 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 30: out1[3] = hc_bytealign_S (in0[0], 0, 2); + case 30: out1[3] = hc_bytealign_S ( 0, in0[0], 2); out1[2] = 0; out1[1] = 0; out1[0] = 0; @@ -743,7 +735,7 @@ DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out out0[1] = 0; out0[0] = 0; break; - case 31: out1[3] = hc_bytealign_S (in0[0], 0, 1); + case 31: out1[3] = hc_bytealign_S ( 0, in0[0], 3); out1[2] = 0; out1[1] = 0; out1[0] = 0; @@ -786,21 +778,17 @@ DECLSPEC void append_block8_optimized (const u32 offset, u32 *buf0, u32 *buf1, c u32 s6 = 0; u32 s7 = 0; - #ifdef IS_AMD const int offset_switch = offset / 4; - #else - const int offset_switch = offset / 4; - #endif #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC - const u32 src_r00 = hc_swap32_S (src_r0[0]); - const u32 src_r01 = hc_swap32_S (src_r0[1]); - const u32 src_r02 = hc_swap32_S (src_r0[2]); - const u32 src_r03 = hc_swap32_S (src_r0[3]); - const u32 src_r10 = hc_swap32_S (src_r1[0]); - const u32 src_r11 = hc_swap32_S (src_r1[1]); - const u32 src_r12 = hc_swap32_S (src_r1[2]); - const u32 src_r13 = hc_swap32_S (src_r1[3]); + const u32 src_r00 = src_r0[0]; + const u32 src_r01 = src_r0[1]; + const u32 src_r02 = src_r0[2]; + const u32 src_r03 = src_r0[3]; + const u32 src_r10 = src_r1[0]; + const u32 src_r11 = src_r1[1]; + const u32 src_r12 = src_r1[2]; + const u32 src_r13 = src_r1[3]; switch (offset_switch) { @@ -893,15 +881,6 @@ DECLSPEC void append_block8_optimized (const u32 offset, u32 *buf0, u32 *buf1, c s0 = 0; break; } - - s0 = hc_swap32_S (s0); - s1 = hc_swap32_S (s1); - s2 = hc_swap32_S (s2); - s3 = hc_swap32_S (s3); - s4 = hc_swap32_S (s4); - s5 = hc_swap32_S (s5); - s6 = hc_swap32_S (s6); - s7 = hc_swap32_S (s7); #endif #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV