From 15a9644f2a7420f803e1af612c4199d2564a9b59 Mon Sep 17 00:00:00 2001 From: philsmd Date: Fri, 3 Feb 2017 00:07:31 +0100 Subject: [PATCH] fixes #943: added -m 15000 = FileZilla Server >= 0.9.55 --- OpenCL/m15000_a0.cl | 2375 +++++++++++++++++++++++++++++ OpenCL/m15000_a1.cl | 2493 +++++++++++++++++++++++++++++++ OpenCL/m15000_a3.cl | 2313 ++++++++++++++++++++++++++++ docs/changes.txt | 1 + docs/readme.txt | 1 + extra/tab_completion/hashcat.sh | 2 +- include/interface.h | 4 + src/interface.c | 76 + src/usage.c | 1 + tools/test.pl | 12 +- tools/test.sh | 2 +- 11 files changed, 7272 insertions(+), 8 deletions(-) create mode 100644 OpenCL/m15000_a0.cl create mode 100644 OpenCL/m15000_a1.cl create mode 100644 OpenCL/m15000_a3.cl diff --git a/OpenCL/m15000_a0.cl b/OpenCL/m15000_a0.cl new file mode 100644 index 000000000..25b75e40a --- /dev/null +++ b/OpenCL/m15000_a0.cl @@ -0,0 +1,2375 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define _SHA512_ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_simd.cl" + +inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset; + + switch (offset / 4) + { + case 0: + w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); + w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[0] = amd_bytealign_S (w6[0], w5[3], 
offset_minus_4); + w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 
1: + w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + 
w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 2: + w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], 
w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 3: + w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], 
offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 4: + w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] 
= amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 5: + w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[2] = amd_bytealign_S (w3[1], 
w3[0], offset_minus_4); + w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 6: + w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[0] = 
amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 7: + w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[2] = amd_bytealign_S (w3[3], w3[2], 
offset_minus_4); + w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 8: + w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[2] = 
amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 9: + w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[2] = amd_bytealign_S (w3[1], w3[0], 
offset_minus_4); + w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 10: + w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[0] = amd_bytealign_S 
(w2[2], w2[1], offset_minus_4); + w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 11: + w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[1] = 
amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 12: + w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] 
= amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 13: + w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = 
w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 14: + w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 15: + w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[2] = amd_bytealign_S (w3[3], w3[2], 
offset_minus_4); + w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w7[3] = __byte_perm_S (w7[2], w7[3], selector); + w7[2] = __byte_perm_S (w7[1], w7[2], selector); + w7[1] = __byte_perm_S (w7[0], w7[1], selector); + w7[0] = __byte_perm_S (w6[3], w7[0], selector); + w6[3] = __byte_perm_S (w6[2], w6[3], selector); + w6[2] = __byte_perm_S (w6[1], w6[2], selector); + w6[1] = __byte_perm_S (w6[0], w6[1], selector); + w6[0] = __byte_perm_S (w5[3], w6[0], selector); + 
w5[3] = __byte_perm_S (w5[2], w5[3], selector); + w5[2] = __byte_perm_S (w5[1], w5[2], selector); + w5[1] = __byte_perm_S (w5[0], w5[1], selector); + w5[0] = __byte_perm_S (w4[3], w5[0], selector); + w4[3] = __byte_perm_S (w4[2], w4[3], selector); + w4[2] = __byte_perm_S (w4[1], w4[2], selector); + w4[1] = __byte_perm_S (w4[0], w4[1], selector); + w4[0] = __byte_perm_S (w3[3], w4[0], selector); + w3[3] = __byte_perm_S (w3[2], w3[3], selector); + w3[2] = __byte_perm_S (w3[1], w3[2], selector); + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] = __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); + break; + + case 1: + w7[3] = __byte_perm_S (w7[1], w7[2], selector); + w7[2] = __byte_perm_S (w7[0], w7[1], selector); + w7[1] = __byte_perm_S (w6[3], w7[0], selector); + w7[0] = __byte_perm_S (w6[2], w6[3], selector); + w6[3] = __byte_perm_S (w6[1], w6[2], selector); + w6[2] = __byte_perm_S (w6[0], w6[1], selector); + w6[1] = __byte_perm_S (w5[3], w6[0], selector); + w6[0] = __byte_perm_S (w5[2], w5[3], selector); + w5[3] = __byte_perm_S (w5[1], w5[2], selector); + w5[2] = __byte_perm_S (w5[0], w5[1], selector); + w5[1] = __byte_perm_S (w4[3], w5[0], selector); + w5[0] = __byte_perm_S (w4[2], w4[3], selector); + w4[3] = __byte_perm_S (w4[1], w4[2], selector); + w4[2] = __byte_perm_S (w4[0], w4[1], selector); + w4[1] = __byte_perm_S (w3[3], w4[0], selector); + w4[0] = __byte_perm_S (w3[2], 
w3[3], selector); + w3[3] = __byte_perm_S (w3[1], w3[2], selector); + w3[2] = __byte_perm_S (w3[0], w3[1], selector); + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = __byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; + break; + + case 2: + w7[3] = __byte_perm_S (w7[0], w7[1], selector); + w7[2] = __byte_perm_S (w6[3], w7[0], selector); + w7[1] = __byte_perm_S (w6[2], w6[3], selector); + w7[0] = __byte_perm_S (w6[1], w6[2], selector); + w6[3] = __byte_perm_S (w6[0], w6[1], selector); + w6[2] = __byte_perm_S (w5[3], w6[0], selector); + w6[1] = __byte_perm_S (w5[2], w5[3], selector); + w6[0] = __byte_perm_S (w5[1], w5[2], selector); + w5[3] = __byte_perm_S (w5[0], w5[1], selector); + w5[2] = __byte_perm_S (w4[3], w5[0], selector); + w5[1] = __byte_perm_S (w4[2], w4[3], selector); + w5[0] = __byte_perm_S (w4[1], w4[2], selector); + w4[3] = __byte_perm_S (w4[0], w4[1], selector); + w4[2] = __byte_perm_S (w3[3], w4[0], selector); + w4[1] = __byte_perm_S (w3[2], w3[3], selector); + w4[0] = __byte_perm_S (w3[1], w3[2], selector); + w3[3] = __byte_perm_S (w3[0], w3[1], selector); + w3[2] = __byte_perm_S (w2[3], w3[0], selector); + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); 
+ w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S (w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w7[3] = __byte_perm_S (w6[3], w7[0], selector); + w7[2] = __byte_perm_S (w6[2], w6[3], selector); + w7[1] = __byte_perm_S (w6[1], w6[2], selector); + w7[0] = __byte_perm_S (w6[0], w6[1], selector); + w6[3] = __byte_perm_S (w5[3], w6[0], selector); + w6[2] = __byte_perm_S (w5[2], w5[3], selector); + w6[1] = __byte_perm_S (w5[1], w5[2], selector); + w6[0] = __byte_perm_S (w5[0], w5[1], selector); + w5[3] = __byte_perm_S (w4[3], w5[0], selector); + w5[2] = __byte_perm_S (w4[2], w4[3], selector); + w5[1] = __byte_perm_S (w4[1], w4[2], selector); + w5[0] = __byte_perm_S (w4[0], w4[1], selector); + w4[3] = __byte_perm_S (w3[3], w4[0], selector); + w4[2] = __byte_perm_S (w3[2], w3[3], selector); + w4[1] = __byte_perm_S (w3[1], w3[2], selector); + w4[0] = __byte_perm_S (w3[0], w3[1], selector); + w3[3] = __byte_perm_S (w2[3], w3[0], selector); + w3[2] = __byte_perm_S (w2[2], w2[3], selector); + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 4: + w7[3] = __byte_perm_S (w6[2], w6[3], selector); + w7[2] = __byte_perm_S (w6[1], w6[2], selector); + w7[1] = __byte_perm_S (w6[0], w6[1], 
selector); + w7[0] = __byte_perm_S (w5[3], w6[0], selector); + w6[3] = __byte_perm_S (w5[2], w5[3], selector); + w6[2] = __byte_perm_S (w5[1], w5[2], selector); + w6[1] = __byte_perm_S (w5[0], w5[1], selector); + w6[0] = __byte_perm_S (w4[3], w5[0], selector); + w5[3] = __byte_perm_S (w4[2], w4[3], selector); + w5[2] = __byte_perm_S (w4[1], w4[2], selector); + w5[1] = __byte_perm_S (w4[0], w4[1], selector); + w5[0] = __byte_perm_S (w3[3], w4[0], selector); + w4[3] = __byte_perm_S (w3[2], w3[3], selector); + w4[2] = __byte_perm_S (w3[1], w3[2], selector); + w4[1] = __byte_perm_S (w3[0], w3[1], selector); + w4[0] = __byte_perm_S (w2[3], w3[0], selector); + w3[3] = __byte_perm_S (w2[2], w2[3], selector); + w3[2] = __byte_perm_S (w2[1], w2[2], selector); + w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 5: + w7[3] = __byte_perm_S (w6[1], w6[2], selector); + w7[2] = __byte_perm_S (w6[0], w6[1], selector); + w7[1] = __byte_perm_S (w5[3], w6[0], selector); + w7[0] = __byte_perm_S (w5[2], w5[3], selector); + w6[3] = __byte_perm_S (w5[1], w5[2], selector); + w6[2] = __byte_perm_S (w5[0], w5[1], selector); + w6[1] = __byte_perm_S (w4[3], w5[0], selector); + w6[0] = __byte_perm_S (w4[2], w4[3], selector); + w5[3] = __byte_perm_S (w4[1], w4[2], selector); + w5[2] = __byte_perm_S (w4[0], w4[1], selector); + w5[1] = __byte_perm_S (w3[3], w4[0], selector); + w5[0] = __byte_perm_S (w3[2], w3[3], selector); + w4[3] = __byte_perm_S (w3[1], w3[2], selector); + w4[2] = 
__byte_perm_S (w3[0], w3[1], selector); + w4[1] = __byte_perm_S (w2[3], w3[0], selector); + w4[0] = __byte_perm_S (w2[2], w2[3], selector); + w3[3] = __byte_perm_S (w2[1], w2[2], selector); + w3[2] = __byte_perm_S (w2[0], w2[1], selector); + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 6: + w7[3] = __byte_perm_S (w6[0], w6[1], selector); + w7[2] = __byte_perm_S (w5[3], w6[0], selector); + w7[1] = __byte_perm_S (w5[2], w5[3], selector); + w7[0] = __byte_perm_S (w5[1], w5[2], selector); + w6[3] = __byte_perm_S (w5[0], w5[1], selector); + w6[2] = __byte_perm_S (w4[3], w5[0], selector); + w6[1] = __byte_perm_S (w4[2], w4[3], selector); + w6[0] = __byte_perm_S (w4[1], w4[2], selector); + w5[3] = __byte_perm_S (w4[0], w4[1], selector); + w5[2] = __byte_perm_S (w3[3], w4[0], selector); + w5[1] = __byte_perm_S (w3[2], w3[3], selector); + w5[0] = __byte_perm_S (w3[1], w3[2], selector); + w4[3] = __byte_perm_S (w3[0], w3[1], selector); + w4[2] = __byte_perm_S (w2[3], w3[0], selector); + w4[1] = __byte_perm_S (w2[2], w2[3], selector); + w4[0] = __byte_perm_S (w2[1], w2[2], selector); + w3[3] = __byte_perm_S (w2[0], w2[1], selector); + w3[2] = __byte_perm_S (w1[3], w2[0], selector); + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S 
(w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 7: + w7[3] = __byte_perm_S (w5[3], w6[0], selector); + w7[2] = __byte_perm_S (w5[2], w5[3], selector); + w7[1] = __byte_perm_S (w5[1], w5[2], selector); + w7[0] = __byte_perm_S (w5[0], w5[1], selector); + w6[3] = __byte_perm_S (w4[3], w5[0], selector); + w6[2] = __byte_perm_S (w4[2], w4[3], selector); + w6[1] = __byte_perm_S (w4[1], w4[2], selector); + w6[0] = __byte_perm_S (w4[0], w4[1], selector); + w5[3] = __byte_perm_S (w3[3], w4[0], selector); + w5[2] = __byte_perm_S (w3[2], w3[3], selector); + w5[1] = __byte_perm_S (w3[1], w3[2], selector); + w5[0] = __byte_perm_S (w3[0], w3[1], selector); + w4[3] = __byte_perm_S (w2[3], w3[0], selector); + w4[2] = __byte_perm_S (w2[2], w2[3], selector); + w4[1] = __byte_perm_S (w2[1], w2[2], selector); + w4[0] = __byte_perm_S (w2[0], w2[1], selector); + w3[3] = __byte_perm_S (w1[3], w2[0], selector); + w3[2] = __byte_perm_S (w1[2], w1[3], selector); + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 8: + w7[3] = __byte_perm_S (w5[2], w5[3], selector); + w7[2] = __byte_perm_S (w5[1], w5[2], selector); + w7[1] = __byte_perm_S (w5[0], w5[1], selector); + w7[0] = __byte_perm_S (w4[3], w5[0], selector); + w6[3] = __byte_perm_S (w4[2], w4[3], selector); + w6[2] = __byte_perm_S (w4[1], w4[2], selector); + w6[1] = __byte_perm_S (w4[0], w4[1], selector); + w6[0] = __byte_perm_S (w3[3], w4[0], selector); + w5[3] = __byte_perm_S (w3[2], w3[3], selector); + w5[2] = 
__byte_perm_S (w3[1], w3[2], selector); + w5[1] = __byte_perm_S (w3[0], w3[1], selector); + w5[0] = __byte_perm_S (w2[3], w3[0], selector); + w4[3] = __byte_perm_S (w2[2], w2[3], selector); + w4[2] = __byte_perm_S (w2[1], w2[2], selector); + w4[1] = __byte_perm_S (w2[0], w2[1], selector); + w4[0] = __byte_perm_S (w1[3], w2[0], selector); + w3[3] = __byte_perm_S (w1[2], w1[3], selector); + w3[2] = __byte_perm_S (w1[1], w1[2], selector); + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 9: + w7[3] = __byte_perm_S (w5[1], w5[2], selector); + w7[2] = __byte_perm_S (w5[0], w5[1], selector); + w7[1] = __byte_perm_S (w4[3], w5[0], selector); + w7[0] = __byte_perm_S (w4[2], w4[3], selector); + w6[3] = __byte_perm_S (w4[1], w4[2], selector); + w6[2] = __byte_perm_S (w4[0], w4[1], selector); + w6[1] = __byte_perm_S (w3[3], w4[0], selector); + w6[0] = __byte_perm_S (w3[2], w3[3], selector); + w5[3] = __byte_perm_S (w3[1], w3[2], selector); + w5[2] = __byte_perm_S (w3[0], w3[1], selector); + w5[1] = __byte_perm_S (w2[3], w3[0], selector); + w5[0] = __byte_perm_S (w2[2], w2[3], selector); + w4[3] = __byte_perm_S (w2[1], w2[2], selector); + w4[2] = __byte_perm_S (w2[0], w2[1], selector); + w4[1] = __byte_perm_S (w1[3], w2[0], selector); + w4[0] = __byte_perm_S (w1[2], w1[3], selector); + w3[3] = __byte_perm_S (w1[1], w1[2], selector); + w3[2] = __byte_perm_S (w1[0], w1[1], selector); + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], 
selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 10: + w7[3] = __byte_perm_S (w5[0], w5[1], selector); + w7[2] = __byte_perm_S (w4[3], w5[0], selector); + w7[1] = __byte_perm_S (w4[2], w4[3], selector); + w7[0] = __byte_perm_S (w4[1], w4[2], selector); + w6[3] = __byte_perm_S (w4[0], w4[1], selector); + w6[2] = __byte_perm_S (w3[3], w4[0], selector); + w6[1] = __byte_perm_S (w3[2], w3[3], selector); + w6[0] = __byte_perm_S (w3[1], w3[2], selector); + w5[3] = __byte_perm_S (w3[0], w3[1], selector); + w5[2] = __byte_perm_S (w2[3], w3[0], selector); + w5[1] = __byte_perm_S (w2[2], w2[3], selector); + w5[0] = __byte_perm_S (w2[1], w2[2], selector); + w4[3] = __byte_perm_S (w2[0], w2[1], selector); + w4[2] = __byte_perm_S (w1[3], w2[0], selector); + w4[1] = __byte_perm_S (w1[2], w1[3], selector); + w4[0] = __byte_perm_S (w1[1], w1[2], selector); + w3[3] = __byte_perm_S (w1[0], w1[1], selector); + w3[2] = __byte_perm_S (w0[3], w1[0], selector); + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 11: + w7[3] = __byte_perm_S (w4[3], w5[0], selector); + w7[2] = __byte_perm_S (w4[2], w4[3], selector); + w7[1] = __byte_perm_S (w4[1], w4[2], selector); + w7[0] = __byte_perm_S (w4[0], w4[1], selector); + w6[3] = __byte_perm_S (w3[3], w4[0], selector); + w6[2] = __byte_perm_S (w3[2], w3[3], selector); + w6[1] = __byte_perm_S (w3[1], w3[2], selector); + w6[0] = __byte_perm_S (w3[0], w3[1], selector); + w5[3] = __byte_perm_S (w2[3], w3[0], selector); + w5[2] = __byte_perm_S (w2[2], w2[3], selector); + w5[1] = __byte_perm_S (w2[1], w2[2], selector); + w5[0] = __byte_perm_S (w2[0], w2[1], selector); 
+ w4[3] = __byte_perm_S (w1[3], w2[0], selector); + w4[2] = __byte_perm_S (w1[2], w1[3], selector); + w4[1] = __byte_perm_S (w1[1], w1[2], selector); + w4[0] = __byte_perm_S (w1[0], w1[1], selector); + w3[3] = __byte_perm_S (w0[3], w1[0], selector); + w3[2] = __byte_perm_S (w0[2], w0[3], selector); + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 12: + w7[3] = __byte_perm_S (w4[2], w4[3], selector); + w7[2] = __byte_perm_S (w4[1], w4[2], selector); + w7[1] = __byte_perm_S (w4[0], w4[1], selector); + w7[0] = __byte_perm_S (w3[3], w4[0], selector); + w6[3] = __byte_perm_S (w3[2], w3[3], selector); + w6[2] = __byte_perm_S (w3[1], w3[2], selector); + w6[1] = __byte_perm_S (w3[0], w3[1], selector); + w6[0] = __byte_perm_S (w2[3], w3[0], selector); + w5[3] = __byte_perm_S (w2[2], w2[3], selector); + w5[2] = __byte_perm_S (w2[1], w2[2], selector); + w5[1] = __byte_perm_S (w2[0], w2[1], selector); + w5[0] = __byte_perm_S (w1[3], w2[0], selector); + w4[3] = __byte_perm_S (w1[2], w1[3], selector); + w4[2] = __byte_perm_S (w1[1], w1[2], selector); + w4[1] = __byte_perm_S (w1[0], w1[1], selector); + w4[0] = __byte_perm_S (w0[3], w1[0], selector); + w3[3] = __byte_perm_S (w0[2], w0[3], selector); + w3[2] = __byte_perm_S (w0[1], w0[2], selector); + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 13: + w7[3] = __byte_perm_S (w4[1], w4[2], selector); + w7[2] = __byte_perm_S (w4[0], w4[1], selector); + w7[1] = __byte_perm_S (w3[3], w4[0], selector); + w7[0] = __byte_perm_S (w3[2], w3[3], selector); + w6[3] = 
__byte_perm_S (w3[1], w3[2], selector); + w6[2] = __byte_perm_S (w3[0], w3[1], selector); + w6[1] = __byte_perm_S (w2[3], w3[0], selector); + w6[0] = __byte_perm_S (w2[2], w2[3], selector); + w5[3] = __byte_perm_S (w2[1], w2[2], selector); + w5[2] = __byte_perm_S (w2[0], w2[1], selector); + w5[1] = __byte_perm_S (w1[3], w2[0], selector); + w5[0] = __byte_perm_S (w1[2], w1[3], selector); + w4[3] = __byte_perm_S (w1[1], w1[2], selector); + w4[2] = __byte_perm_S (w1[0], w1[1], selector); + w4[1] = __byte_perm_S (w0[3], w1[0], selector); + w4[0] = __byte_perm_S (w0[2], w0[3], selector); + w3[3] = __byte_perm_S (w0[1], w0[2], selector); + w3[2] = __byte_perm_S (w0[0], w0[1], selector); + w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 14: + w7[3] = __byte_perm_S (w4[0], w4[1], selector); + w7[2] = __byte_perm_S (w3[3], w4[0], selector); + w7[1] = __byte_perm_S (w3[2], w3[3], selector); + w7[0] = __byte_perm_S (w3[1], w3[2], selector); + w6[3] = __byte_perm_S (w3[0], w3[1], selector); + w6[2] = __byte_perm_S (w2[3], w3[0], selector); + w6[1] = __byte_perm_S (w2[2], w2[3], selector); + w6[0] = __byte_perm_S (w2[1], w2[2], selector); + w5[3] = __byte_perm_S (w2[0], w2[1], selector); + w5[2] = __byte_perm_S (w1[3], w2[0], selector); + w5[1] = __byte_perm_S (w1[2], w1[3], selector); + w5[0] = __byte_perm_S (w1[1], w1[2], selector); + w4[3] = __byte_perm_S (w1[0], w1[1], selector); + w4[2] = __byte_perm_S (w0[3], w1[0], selector); + w4[1] = __byte_perm_S (w0[2], w0[3], selector); + w4[0] = __byte_perm_S (w0[1], w0[2], selector); + w3[3] = __byte_perm_S (w0[0], w0[1], selector); + w3[2] = __byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + 
w0[0] = 0; + break; + + case 15: + w7[3] = __byte_perm_S (w3[3], w4[0], selector); + w7[2] = __byte_perm_S (w3[2], w3[3], selector); + w7[1] = __byte_perm_S (w3[1], w3[2], selector); + w7[0] = __byte_perm_S (w3[0], w3[1], selector); + w6[3] = __byte_perm_S (w2[3], w3[0], selector); + w6[2] = __byte_perm_S (w2[2], w2[3], selector); + w6[1] = __byte_perm_S (w2[1], w2[2], selector); + w6[0] = __byte_perm_S (w2[0], w2[1], selector); + w5[3] = __byte_perm_S (w1[3], w2[0], selector); + w5[2] = __byte_perm_S (w1[2], w1[3], selector); + w5[1] = __byte_perm_S (w1[1], w1[2], selector); + w5[0] = __byte_perm_S (w1[0], w1[1], selector); + w4[3] = __byte_perm_S (w0[3], w1[0], selector); + w4[2] = __byte_perm_S (w0[2], w0[3], selector); + w4[1] = __byte_perm_S (w0[1], w0[2], selector); + w4[0] = __byte_perm_S (w0[0], w0[1], selector); + w3[3] = __byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} +/* PACKVS84: invokes PACKVS4 once for each of the eight (sN, vN) pairs, always with the same selector e. PACKVS4 is defined outside this chunk; presumably it copies element e of a vector array into a scalar array (TODO confirm in inc_simd.cl). */ +#define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ + PACKVS4 (s0, v0, e); \ + PACKVS4 (s1, v1, e); \ + PACKVS4 (s2, v2, e); \ + PACKVS4 (s3, v3, e); \ + PACKVS4 (s4, v4, e); \ + PACKVS4 (s5, v5, e); \ + PACKVS4 (s6, v6, e); \ + PACKVS4 (s7, v7, e); +/* PACKSV84: invokes PACKSV4 once for each of the eight (sN, vN) pairs, always with the same selector e. PACKSV4 is defined outside this chunk; presumably it is the inverse of PACKVS4 and writes the scalar array back into element e (TODO confirm in inc_simd.cl). */ +#define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ + PACKSV4 (s0, v0, e); \ + PACKSV4 (s1, v1, e); \ + PACKSV4 (s2, v2, e); \ + PACKSV4 (s3, v3, e); \ + PACKSV4 (s4, v4, e); \ + PACKSV4 (s5, v5, e); \ + PACKSV4 (s6, v6, e); \ + PACKSV4 (s7, v7, e); +/* Vector front end for switch_buffer_by_offset_8x4_le_S. With VECT_SIZE == 1 the arguments are forwarded unchanged. For VECT_SIZE 2, 4, 8 and 16 every element is processed in turn: PACKVS84 with the element selector, the scalar routine on scratch arrays t0..t7 with that element of offset (offset.s0 .. offset.sf), then PACKSV84 with the same selector. w0..w7 are updated in place. */ +inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) +{ + #if VECT_SIZE == 1 + + switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset); + + #else + + u32 t0[4]; + u32 t1[4]; + u32 t2[4]; + u32 t3[4]; + u32 t4[4]; + u32 t5[4];
+ u32 t6[4]; + u32 t7[4]; + + #endif + + #if VECT_SIZE == 2 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + #elif VECT_SIZE == 4 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + #elif VECT_SIZE == 8 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4,
t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + // 5 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + + // 6 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + + // 7 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + + // 8 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + + #elif VECT_SIZE == 16 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S
(t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + // 5 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + + // 6 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + + // 7 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + + // 8 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + + // 9 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s8); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); + + // 10 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); + switch_buffer_by_offset_8x4_le_S (t0,
t1, t2, t3, t4, t5, t6, t7, offset.s9); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); + + // 11 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sa); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); + + // 12 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sb); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); + + // 13 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sc); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); + + // 14 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sd); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); + + // 15 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.se); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); + + // 16 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sf); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); + + #endif +} +/* Table of the 80 constants SHA512C00 .. SHA512C4f, one entry per step; sha512_transform reads it through ROUND_STEP. The SHA512Cxx macros are defined outside this chunk (presumably the standard SHA-512 round constants - TODO confirm in inc_hash_constants.h). */ +__constant u64 k_sha512[80] = +{ + SHA512C00, SHA512C01, SHA512C02, SHA512C03, + SHA512C04, SHA512C05, SHA512C06, SHA512C07, + SHA512C08, SHA512C09, SHA512C0a, SHA512C0b, + SHA512C0c, SHA512C0d, SHA512C0e, SHA512C0f, + SHA512C10, SHA512C11, SHA512C12, SHA512C13, + SHA512C14, SHA512C15, SHA512C16, SHA512C17, + SHA512C18, SHA512C19, SHA512C1a, SHA512C1b, +
SHA512C1c, SHA512C1d, SHA512C1e, SHA512C1f, + SHA512C20, SHA512C21, SHA512C22, SHA512C23, + SHA512C24, SHA512C25, SHA512C26, SHA512C27, + SHA512C28, SHA512C29, SHA512C2a, SHA512C2b, + SHA512C2c, SHA512C2d, SHA512C2e, SHA512C2f, + SHA512C30, SHA512C31, SHA512C32, SHA512C33, + SHA512C34, SHA512C35, SHA512C36, SHA512C37, + SHA512C38, SHA512C39, SHA512C3a, SHA512C3b, + SHA512C3c, SHA512C3d, SHA512C3e, SHA512C3f, + SHA512C40, SHA512C41, SHA512C42, SHA512C43, + SHA512C44, SHA512C45, SHA512C46, SHA512C47, + SHA512C48, SHA512C49, SHA512C4a, SHA512C4b, + SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, +}; + +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], const u32x w4[4], const u32x w5[4], const u32x w6[4], const u32x w7[4], u64x digest[8]) +{ + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = hl32_to_64 (w3[2], w3[3]); + u64x w8_t = hl32_to_64 (w4[0], w4[1]); + u64x w9_t = hl32_to_64 (w4[2], w4[3]); + u64x wa_t = hl32_to_64 (w5[0], w5[1]); + u64x wb_t = hl32_to_64 (w5[2], w5[3]); + u64x wc_t = hl32_to_64 (w6[0], w6[1]); + u64x wd_t = hl32_to_64 (w6[2], w6[3]); + u64x we_t = hl32_to_64 (w7[0], w7[1]); + u64x wf_t = hl32_to_64 (w7[2], w7[3]); + + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; + + #define ROUND_EXPAND() \ + { \ + w0_t = SHA512_EXPAND (we_t, w9_t, w1_t, w0_t); \ + w1_t = SHA512_EXPAND (wf_t, wa_t, w2_t, w1_t); \ + w2_t = SHA512_EXPAND (w0_t, wb_t, w3_t, w2_t); \ + w3_t = SHA512_EXPAND (w1_t, wc_t, w4_t, w3_t); \ + w4_t = SHA512_EXPAND (w2_t, wd_t, w5_t, w4_t); \ + w5_t = SHA512_EXPAND (w3_t, we_t, w6_t, w5_t); \ + w6_t = SHA512_EXPAND 
(w4_t, wf_t, w7_t, w6_t); \ + w7_t = SHA512_EXPAND (w5_t, w0_t, w8_t, w7_t); \ + w8_t = SHA512_EXPAND (w6_t, w1_t, w9_t, w8_t); \ + w9_t = SHA512_EXPAND (w7_t, w2_t, wa_t, w9_t); \ + wa_t = SHA512_EXPAND (w8_t, w3_t, wb_t, wa_t); \ + wb_t = SHA512_EXPAND (w9_t, w4_t, wc_t, wb_t); \ + wc_t = SHA512_EXPAND (wa_t, w5_t, wd_t, wc_t); \ + wd_t = SHA512_EXPAND (wb_t, w6_t, we_t, wd_t); \ + we_t = SHA512_EXPAND (wc_t, w7_t, wf_t, we_t); \ + wf_t = SHA512_EXPAND (wd_t, w8_t, w0_t, wf_t); \ + } + + #define ROUND_STEP(i) \ + { \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha512[i + 0]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha512[i + 1]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha512[i + 2]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha512[i + 3]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha512[i + 4]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha512[i + 5]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha512[i + 6]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha512[i + 7]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha512[i + 8]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha512[i + 9]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha512[i + 10]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha512[i + 11]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha512[i + 12]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha512[i + 13]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, we_t, k_sha512[i + 14]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha512[i + 15]); \ + } + + ROUND_STEP (0); + + #ifdef _unroll + 
#pragma unroll
+  #endif
+  for (int i = 16; i < 80; i += 16)
+  {
+    ROUND_EXPAND (); ROUND_STEP (i);
+  }
+
+  /* rev
+  digest[0] += a;
+  digest[1] += b;
+  digest[2] += c;
+  digest[3] += d;
+  digest[4] += e;
+  digest[5] += f;
+  digest[6] += g;
+  digest[7] += h;
+  */
+
+  digest[0] = a;
+  digest[1] = b;
+  digest[2] = c;
+  digest[3] = d;
+  digest[4] = e;
+  digest[5] = f;
+  digest[6] = g;
+  digest[7] = h;
+}
+/**
+ * m15000_m04: rule-based kernel entry point that ends in COMPARE_M_SIMD.
+ *
+ * Per work-item: load one base candidate (8 words from pws[gid]) and the
+ * salt (16 words from salt_bufs[salt_pos]); then, for every il_pos step,
+ * let apply_rules_vect produce the modified candidate, build one 8x4-word
+ * message buffer holding the candidate followed by the salt, run
+ * sha512_transform on it and pass four 32-bit words of the result to
+ * COMPARE_M_SIMD.
+ *
+ * apply_rules_vect, switch_buffer_by_offset_8x4_le_VV and COMPARE_M_SIMD are
+ * defined outside this chunk; COMPARE_M_SIMD is presumably the multi-digest
+ * match check - confirm against its definition.
+ */
+__kernel void m15000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{
+  /**
+   * modifier
+   *
+   * lid is not read again anywhere in the visible kernel body.
+   * NOTE(review): COMPARE_M_SIMD is a macro defined elsewhere and may pick
+   * up locals by name - check it before treating lid as dead.
+   */
+
+  const u32 lid = get_local_id (0);
+
+  /**
+   * base
+   *
+   * Work-items whose global id is at or beyond gid_max return immediately.
+   * The candidate is copied as exactly 8 words (pws[gid].i[0..7]) together
+   * with its length pw_len.
+   */
+
+  const u32 gid = get_global_id (0);
+
+  if (gid >= gid_max) return;
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  /**
+   * salt
+   *
+   * All 16 salt words are copied unconditionally; salt_len is only used
+   * later for the length field.
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
+  salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
+  salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
+  salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
+  salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
+  salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[salt_pos].salt_len;
+
+  /**
+   * loop
+   *
+   * One iteration per VECT_SIZE-wide group of il_pos values; il_pos is
+   * passed to apply_rules_vect together with rules_buf.
+   */
+
+  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
+  {
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };
+
+    const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
+
+    /**
+     * append salt
+     *
+     * The salt words are laid out first (w0_t..w3_t) with a 0x80 marker in
+     * w4_t[0], i.e. at byte 64 of the buffer, and everything after it zero.
+     * switch_buffer_by_offset_8x4_le_VV is then called with out_len
+     * (presumably moving the whole buffer up by out_len bytes, like the _S
+     * variant at the top of this file - confirm), and the candidate words
+     * are OR-ed into the front.
+     *
+     * NOTE(review): the marker position is hard-wired to byte 64 while the
+     * length field below uses salt_len; the two agree only when salt_len is
+     * 64 - confirm the host side guarantees that.
+     */
+
+    const u32x pw_salt_len = out_len + salt_len;
+
+    u32x w0_t[4];
+    u32x w1_t[4];
+    u32x w2_t[4];
+    u32x w3_t[4];
+    u32x w4_t[4];
+    u32x w5_t[4];
+    u32x w6_t[4];
+    u32x w7_t[4];
+
+    w0_t[0] = salt_buf0[0];
+    w0_t[1] = salt_buf0[1];
+    w0_t[2] = salt_buf0[2];
+    w0_t[3] = salt_buf0[3];
+    w1_t[0] = salt_buf1[0];
+    w1_t[1] = salt_buf1[1];
+    w1_t[2] = salt_buf1[2];
+    w1_t[3] = salt_buf1[3];
+    w2_t[0] = salt_buf2[0];
+    w2_t[1] = salt_buf2[1];
+    w2_t[2] = salt_buf2[2];
+    w2_t[3] = salt_buf2[3];
+    w3_t[0] = salt_buf3[0];
+    w3_t[1] = salt_buf3[1];
+    w3_t[2] = salt_buf3[2];
+    w3_t[3] = salt_buf3[3];
+    w4_t[0] = 0x80;
+    w4_t[1] = 0;
+    w4_t[2] = 0;
+    w4_t[3] = 0;
+    w5_t[0] = 0;
+    w5_t[1] = 0;
+    w5_t[2] = 0;
+    w5_t[3] = 0;
+    w6_t[0] = 0;
+    w6_t[1] = 0;
+    w6_t[2] = 0;
+    w6_t[3] = 0;
+    w7_t[0] = 0;
+    w7_t[1] = 0;
+    w7_t[2] = 0;
+    w7_t[3] = 0;
+
+    switch_buffer_by_offset_8x4_le_VV (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, out_len);
+    /* OR the candidate in; w2 and w3 are still all-zero here (only w0/w1 are passed to apply_rules_vect) */
+    w0_t[0] |= w0[0];
+    w0_t[1] |= w0[1];
+    w0_t[2] |= w0[2];
+    w0_t[3] |= w0[3];
+    w1_t[0] |= w1[0];
+    w1_t[1] |= w1[1];
+    w1_t[2] |= w1[2];
+    w1_t[3] |= w1[3];
+    w2_t[0] |= w2[0];
+    w2_t[1] |= w2[1];
+    w2_t[2] |= w2[2];
+    w2_t[3] |= w2[3];
+    w3_t[0] |= w3[0];
+    w3_t[1] |= w3[1];
+    w3_t[2] |= w3[2];
+    w3_t[3] |= w3[3];
+    /* words 0..29 go through swap32; the last two words are overwritten with the length field instead */
+    w0_t[0] = swap32 (w0_t[0]);
+    w0_t[1] = swap32 (w0_t[1]);
+    w0_t[2] = swap32 (w0_t[2]);
+    w0_t[3] = swap32 (w0_t[3]);
+    w1_t[0] = swap32 (w1_t[0]);
+    w1_t[1] = swap32 (w1_t[1]);
+    w1_t[2] = swap32 (w1_t[2]);
+    w1_t[3] = swap32 (w1_t[3]);
+    w2_t[0] = swap32 (w2_t[0]);
+    w2_t[1] = swap32 (w2_t[1]);
+    w2_t[2] = swap32 (w2_t[2]);
+    w2_t[3] = swap32 (w2_t[3]);
+    w3_t[0] = swap32 (w3_t[0]);
+    w3_t[1] = swap32 (w3_t[1]);
+    w3_t[2] = swap32 (w3_t[2]);
+    w3_t[3] = swap32 (w3_t[3]);
+    w4_t[0] = swap32 (w4_t[0]);
+    w4_t[1] = swap32 (w4_t[1]);
+    w4_t[2] = swap32 (w4_t[2]);
+    w4_t[3] = swap32 (w4_t[3]);
+    w5_t[0] = swap32 (w5_t[0]);
+    w5_t[1] = swap32 (w5_t[1]);
+    w5_t[2] = swap32 (w5_t[2]);
+    w5_t[3] = swap32 (w5_t[3]);
+    w6_t[0] = swap32 (w6_t[0]);
+    w6_t[1] = swap32 (w6_t[1]);
+    w6_t[2] = swap32 (w6_t[2]);
+    w6_t[3] = swap32 (w6_t[3]);
+    w7_t[0] = swap32 (w7_t[0]);
+    w7_t[1] = swap32 (w7_t[1]);
+    w7_t[2] = 0;
+    w7_t[3] = pw_salt_len * 8; /* combined length times 8 (the bit count, assuming both lengths are in bytes) */
+
+    /**
+     * sha512
+     *
+     * digest[] starts from the SHA512M_A..SHA512M_H constants. The tail of
+     * sha512_transform directly above stores the working variables a..h
+     * into digest[] and keeps the "+=" variant commented out under "rev",
+     * so r0..r3 below are taken from that un-added state.
+     * r0/r1 come from digest[7] and r2/r3 from digest[3] via l32_from_64 /
+     * h32_from_64 (presumably the low and high 32-bit halves - confirm).
+     */
+
+    u64x digest[8];
+
+    digest[0] = SHA512M_A;
+    digest[1] = SHA512M_B;
+    digest[2] = SHA512M_C;
+    digest[3] = SHA512M_D;
+    digest[4] = SHA512M_E;
+    digest[5] = SHA512M_F;
+    digest[6] = SHA512M_G;
+    digest[7] = SHA512M_H;
+
+    sha512_transform (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, digest);
+
+    const u32x r0 = l32_from_64 (digest[7]);
+    const u32x r1 = h32_from_64 (digest[7]);
+    const u32x r2 = l32_from_64 (digest[3]);
+    const u32x r3 = h32_from_64 (digest[3]);
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
+}
+/**
+ * m15000_m08: entry point with the same parameter list as m15000_m04.
+ * Its body is empty, so launching it performs no work.
+ */
+__kernel void m15000_m08
(__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{ /* empty body: m15000_m08 does no work. NOTE(review): presumably kept only so the host can resolve the _m08 kernel name - confirm */
+}
+/**
+ * m15000_m16: entry point with the same parameter list as m15000_m04 and
+ * m15000_m08. Its body is empty, so launching it performs no work.
+ */
+__kernel void m15000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const
u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{ /* empty body: m15000_m16 does no work. NOTE(review): presumably kept only so the host can resolve the _m16 kernel name - confirm */
+}
+/**
+ * m15000_s04: rule-based kernel entry point that ends in COMPARE_S_SIMD.
+ *
+ * Same pipeline as m15000_m04 (load candidate and salt, apply rules, build
+ * the "candidate followed by salt" buffer, run sha512_transform), but it
+ * additionally loads the four words of the single digest at digests_offset
+ * into search[] before the loop and finishes with COMPARE_S_SIMD instead of
+ * COMPARE_M_SIMD.
+ *
+ * apply_rules_vect, switch_buffer_by_offset_8x4_le_VV and COMPARE_S_SIMD are
+ * defined outside this chunk; COMPARE_S_SIMD is presumably the single-digest
+ * match check - confirm against its definition.
+ */
+__kernel void m15000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{
+  /**
+   * modifier
+   *
+   * lid is not read again anywhere in the visible kernel body.
+   * NOTE(review): COMPARE_S_SIMD is a macro defined elsewhere and may pick
+   * up locals by name - check it before treating lid as dead.
+   */
+
+  const u32 lid = get_local_id (0);
+
+  /**
+   * base
+   *
+   * Work-items whose global id is at or beyond gid_max return immediately.
+   * The candidate is copied as exactly 8 words (pws[gid].i[0..7]) together
+   * with its length pw_len.
+   */
+
+  const u32 gid = get_global_id (0);
+
+  if (gid >= gid_max) return;
+
+  u32 pw_buf0[4];
+  u32 pw_buf1[4];
+
+  pw_buf0[0] = pws[gid].i[0];
+  pw_buf0[1] = pws[gid].i[1];
+  pw_buf0[2] = pws[gid].i[2];
+  pw_buf0[3] = pws[gid].i[3];
+  pw_buf1[0] = pws[gid].i[4];
+  pw_buf1[1] = pws[gid].i[5];
+  pw_buf1[2] = pws[gid].i[6];
+  pw_buf1[3] = pws[gid].i[7];
+
+  const u32 pw_len = pws[gid].pw_len;
+
+  /**
+   * digest
+   *
+   * Four words of the digest at index digests_offset, picked by the
+   * DGST_R0..DGST_R3 indices. search[] is not referenced by name in the
+   * visible body; presumably COMPARE_S_SIMD reads it - confirm.
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[digests_offset].digest_buf[DGST_R0],
+    digests_buf[digests_offset].digest_buf[DGST_R1],
+    digests_buf[digests_offset].digest_buf[DGST_R2],
+    digests_buf[digests_offset].digest_buf[DGST_R3]
+  };
+
+  /**
+   * salt
+   *
+   * All 16 salt words are copied unconditionally; salt_len is only used
+   * later for the length field.
+   */
+
+  u32 salt_buf0[4];
+  u32 salt_buf1[4];
+  u32 salt_buf2[4];
+  u32 salt_buf3[4];
+
+  salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
+  salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
+  salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
+  salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
+  salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
+  salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
+  salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
+  salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
+  salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
+  salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
+  salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
+  salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
+  salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
+  salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
+  salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
+  salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
+
+  const u32 salt_len = salt_bufs[salt_pos].salt_len;
+
+  /**
+   * loop
+   *
+   * One iteration per VECT_SIZE-wide group of il_pos values; il_pos is
+   * passed to apply_rules_vect together with rules_buf.
+   */
+
+  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
+  {
+    u32x w0[4] = { 0 };
+    u32x w1[4] = { 0 };
+    u32x w2[4] = { 0 };
+    u32x w3[4] = { 0 };
+
+    const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
+
+    /**
+     * append salt
+     *
+     * The salt words are laid out first (w0_t..w3_t) with a 0x80 marker in
+     * w4_t[0], i.e. at byte 64 of the buffer, and everything after it zero.
+     * switch_buffer_by_offset_8x4_le_VV is then called with out_len
+     * (presumably moving the whole buffer up by out_len bytes, like the _S
+     * variant at the top of this file - confirm), and the candidate words
+     * are OR-ed into the front.
+     *
+     * NOTE(review): the marker position is hard-wired to byte 64 while the
+     * length field below uses salt_len; the two agree only when salt_len is
+     * 64 - confirm the host side guarantees that.
+     */
+
+    const u32x pw_salt_len = out_len + salt_len;
+
+    u32x w0_t[4];
+    u32x w1_t[4];
+    u32x w2_t[4];
+    u32x w3_t[4];
+    u32x w4_t[4];
+    u32x w5_t[4];
+    u32x w6_t[4];
+    u32x w7_t[4];
+
+    w0_t[0] = salt_buf0[0];
+    w0_t[1] = salt_buf0[1];
+    w0_t[2] = salt_buf0[2];
+    w0_t[3] = salt_buf0[3];
+    w1_t[0] = salt_buf1[0];
+    w1_t[1] = salt_buf1[1];
+    w1_t[2] = salt_buf1[2];
+    w1_t[3] = salt_buf1[3];
+    w2_t[0] = salt_buf2[0];
+    w2_t[1] = salt_buf2[1];
+    w2_t[2] = salt_buf2[2];
+    w2_t[3] = salt_buf2[3];
+    w3_t[0] = salt_buf3[0];
+    w3_t[1] = salt_buf3[1];
+    w3_t[2] = salt_buf3[2];
+    w3_t[3] = salt_buf3[3];
+    w4_t[0] = 0x80;
+    w4_t[1] = 0;
+    w4_t[2] = 0;
+    w4_t[3] = 0;
+    w5_t[0] = 0;
+    w5_t[1] = 0;
+    w5_t[2] = 0;
+    w5_t[3] = 0;
+    w6_t[0] = 0;
+    w6_t[1] = 0;
+    w6_t[2] = 0;
+    w6_t[3] = 0;
+    w7_t[0] = 0;
+    w7_t[1] = 0;
+    w7_t[2] = 0;
+    w7_t[3] = 0;
+
+    switch_buffer_by_offset_8x4_le_VV (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, out_len);
+    /* OR the candidate in; w2 and w3 are still all-zero here (only w0/w1 are passed to apply_rules_vect) */
+    w0_t[0] |= w0[0];
+    w0_t[1] |= w0[1];
+    w0_t[2] |= w0[2];
+    w0_t[3] |= w0[3];
+    w1_t[0] |= w1[0];
+    w1_t[1] |= w1[1];
+    w1_t[2] |= w1[2];
+    w1_t[3] |= w1[3];
+    w2_t[0] |= w2[0];
+    w2_t[1] |= w2[1];
+    w2_t[2] |= w2[2];
+    w2_t[3] |= w2[3];
+    w3_t[0] |= w3[0];
+    w3_t[1] |= w3[1];
+    w3_t[2] |= w3[2];
+    w3_t[3] |= w3[3];
+    /* words 0..29 go through swap32; the last two words are overwritten with the length field instead */
+    w0_t[0] = swap32 (w0_t[0]);
+    w0_t[1] = swap32 (w0_t[1]);
+    w0_t[2] = swap32 (w0_t[2]);
+    w0_t[3] = swap32 (w0_t[3]);
+    w1_t[0] = swap32 (w1_t[0]);
+    w1_t[1] = swap32 (w1_t[1]);
+    w1_t[2] = swap32 (w1_t[2]);
+    w1_t[3] = swap32 (w1_t[3]);
+    w2_t[0] = swap32 (w2_t[0]);
+    w2_t[1] = swap32 (w2_t[1]);
+    w2_t[2] = swap32 (w2_t[2]);
+    w2_t[3] = swap32 (w2_t[3]);
+    w3_t[0] = swap32 (w3_t[0]);
+    w3_t[1] = swap32 (w3_t[1]);
+    w3_t[2] = swap32 (w3_t[2]);
+    w3_t[3] = swap32 (w3_t[3]);
+    w4_t[0] = swap32 (w4_t[0]);
+    w4_t[1] = swap32 (w4_t[1]);
+    w4_t[2] = swap32 (w4_t[2]);
+    w4_t[3] = swap32 (w4_t[3]);
+    w5_t[0] = swap32 (w5_t[0]);
+    w5_t[1] = swap32 (w5_t[1]);
+    w5_t[2] = swap32 (w5_t[2]);
+    w5_t[3] = swap32 (w5_t[3]);
+    w6_t[0] = swap32 (w6_t[0]);
+    w6_t[1] = swap32 (w6_t[1]);
+    w6_t[2] = swap32 (w6_t[2]);
+    w6_t[3] = swap32 (w6_t[3]);
+    w7_t[0] = swap32 (w7_t[0]);
+    w7_t[1] = swap32 (w7_t[1]);
+    w7_t[2] = 0;
+    w7_t[3] = pw_salt_len * 8; /* combined length times 8 (the bit count, assuming both lengths are in bytes) */
+
+    /**
+     * sha512
+     *
+     * digest[] starts from the SHA512M_A..SHA512M_H constants and is
+     * updated in place by sha512_transform (defined earlier in this file).
+     * r0/r1 come from digest[7] and r2/r3 from digest[3] via l32_from_64 /
+     * h32_from_64 (presumably the low and high 32-bit halves - confirm).
+     */
+
+    u64x digest[8];
+
+    digest[0] = SHA512M_A;
+    digest[1] = SHA512M_B;
+    digest[2] = SHA512M_C;
+    digest[3] = SHA512M_D;
+    digest[4] = SHA512M_E;
+    digest[5] = SHA512M_F;
+    digest[6] = SHA512M_G;
+    digest[7] = SHA512M_H;
+
+    sha512_transform (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, digest);
+
+    const u32x r0 = l32_from_64 (digest[7]);
+    const u32x r1 = h32_from_64 (digest[7]);
+    const u32x r2 = l32_from_64 (digest[3]);
+    const u32x r3 = h32_from_64 (digest[3]);
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
+}
+/**
+ * m15000_s08: entry point with the same parameter list as m15000_s04.
+ * Its body is empty, so launching it performs no work.
+ */
+__kernel void m15000_s08 (__global pw_t *pws, __global const kernel_rule_t
*rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
+{ /* empty body: m15000_s08 does no work. NOTE(review): presumably kept only so the host can resolve the _s08 kernel name - confirm */
+}
+/**
+ * m15000_s16: entry point with the same parameter list as m15000_s04 and
+ * m15000_s08. Its body is empty, so launching it performs no work.
+ */
+__kernel void m15000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const
u32 combs_mode, const u32 gid_max)
+{ /* empty body: m15000_s16 does no work. NOTE(review): presumably kept only so the host can resolve the _s16 kernel name - confirm */
+}
diff --git a/OpenCL/m15000_a1.cl b/OpenCL/m15000_a1.cl
new file mode 100644
index 000000000..86274b263
--- /dev/null
+++ b/OpenCL/m15000_a1.cl
@@ -0,0 +1,2493 @@
+/**
+ * Author......: See docs/credits.txt
+ * License.....: MIT
+ */
+
+#define _SHA512_
+
+#define NEW_SIMD_CODE
+
+#include "inc_vendor.cl"
+#include "inc_hash_constants.h"
+#include "inc_hash_functions.cl"
+#include "inc_types.cl"
+#include "inc_common.cl"
+#include "inc_simd.cl"
+
+inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
+{
+  #if defined IS_AMD || defined IS_GENERIC
+  const int offset_mod_4 = offset & 3;
+
+  const int offset_minus_4 = 4 - offset;
+
+  switch (offset / 4)
+  {
+    case 0:
+      w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4);
+      w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4);
+      w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4);
+      w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4);
+      w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4);
+      w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4);
+      w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4);
+      w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4);
+      w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4);
+      w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4);
+      w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4);
+      w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); /* NOTE(review): every other wN[0] line here pairs wN[0] with w(N-1)[3]; this one uses w5[3] (already overwritten above), so w4[3] looks intended - confirm */
+      w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4);
+      w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4);
+      w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4);
+      w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4);
+      w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4);
+      w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4);
+      w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4);
+      w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4);
+      w2[3] = amd_bytealign_S
(w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 1: + w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[3] = amd_bytealign_S (w4[2], 
w4[1], offset_minus_4); + w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 2: + w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[2] = amd_bytealign_S (w6[0], w5[3], 
offset_minus_4); + w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 3: + 
w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = 
w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 4: + w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = 
w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 5: + w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = 
amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 6: + w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], 
offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 7: + w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = 
amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 8: + w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], 
offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 9: + w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S 
(w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 10: + w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if 
(offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 11: + w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + 
w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 12: + w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 13: + w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[2] = amd_bytealign_S (w4[1], w4[0], 
offset_minus_4); + w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 14: + w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[0] = amd_bytealign_S (w2[2], w2[1], 
offset_minus_4); + w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 15: + w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[1] = amd_bytealign_S (w0[2], w0[1], 
offset_minus_4); + w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w7[3] = __byte_perm_S (w7[2], w7[3], selector); + w7[2] = __byte_perm_S (w7[1], w7[2], selector); + w7[1] = __byte_perm_S (w7[0], w7[1], selector); + w7[0] = __byte_perm_S (w6[3], w7[0], selector); + w6[3] = __byte_perm_S (w6[2], w6[3], selector); + w6[2] = __byte_perm_S (w6[1], w6[2], selector); + w6[1] = __byte_perm_S (w6[0], w6[1], selector); + w6[0] = __byte_perm_S (w5[3], w6[0], selector); + w5[3] = __byte_perm_S (w5[2], w5[3], selector); + w5[2] = __byte_perm_S (w5[1], w5[2], selector); + w5[1] = __byte_perm_S (w5[0], w5[1], selector); + w5[0] = __byte_perm_S (w4[3], w5[0], selector); + w4[3] = __byte_perm_S (w4[2], w4[3], selector); + w4[2] = __byte_perm_S (w4[1], w4[2], selector); + w4[1] = __byte_perm_S (w4[0], w4[1], selector); + w4[0] = __byte_perm_S (w3[3], w4[0], selector); + w3[3] = __byte_perm_S (w3[2], w3[3], selector); + w3[2] = __byte_perm_S (w3[1], w3[2], selector); + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] 
= __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); + break; + + case 1: + w7[3] = __byte_perm_S (w7[1], w7[2], selector); + w7[2] = __byte_perm_S (w7[0], w7[1], selector); + w7[1] = __byte_perm_S (w6[3], w7[0], selector); + w7[0] = __byte_perm_S (w6[2], w6[3], selector); + w6[3] = __byte_perm_S (w6[1], w6[2], selector); + w6[2] = __byte_perm_S (w6[0], w6[1], selector); + w6[1] = __byte_perm_S (w5[3], w6[0], selector); + w6[0] = __byte_perm_S (w5[2], w5[3], selector); + w5[3] = __byte_perm_S (w5[1], w5[2], selector); + w5[2] = __byte_perm_S (w5[0], w5[1], selector); + w5[1] = __byte_perm_S (w4[3], w5[0], selector); + w5[0] = __byte_perm_S (w4[2], w4[3], selector); + w4[3] = __byte_perm_S (w4[1], w4[2], selector); + w4[2] = __byte_perm_S (w4[0], w4[1], selector); + w4[1] = __byte_perm_S (w3[3], w4[0], selector); + w4[0] = __byte_perm_S (w3[2], w3[3], selector); + w3[3] = __byte_perm_S (w3[1], w3[2], selector); + w3[2] = __byte_perm_S (w3[0], w3[1], selector); + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = __byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], 
selector); + w0[0] = 0; + break; + + case 2: + w7[3] = __byte_perm_S (w7[0], w7[1], selector); + w7[2] = __byte_perm_S (w6[3], w7[0], selector); + w7[1] = __byte_perm_S (w6[2], w6[3], selector); + w7[0] = __byte_perm_S (w6[1], w6[2], selector); + w6[3] = __byte_perm_S (w6[0], w6[1], selector); + w6[2] = __byte_perm_S (w5[3], w6[0], selector); + w6[1] = __byte_perm_S (w5[2], w5[3], selector); + w6[0] = __byte_perm_S (w5[1], w5[2], selector); + w5[3] = __byte_perm_S (w5[0], w5[1], selector); + w5[2] = __byte_perm_S (w4[3], w5[0], selector); + w5[1] = __byte_perm_S (w4[2], w4[3], selector); + w5[0] = __byte_perm_S (w4[1], w4[2], selector); + w4[3] = __byte_perm_S (w4[0], w4[1], selector); + w4[2] = __byte_perm_S (w3[3], w4[0], selector); + w4[1] = __byte_perm_S (w3[2], w3[3], selector); + w4[0] = __byte_perm_S (w3[1], w3[2], selector); + w3[3] = __byte_perm_S (w3[0], w3[1], selector); + w3[2] = __byte_perm_S (w2[3], w3[0], selector); + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); + w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S (w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w7[3] = __byte_perm_S (w6[3], w7[0], selector); + w7[2] = __byte_perm_S (w6[2], w6[3], selector); + w7[1] = __byte_perm_S (w6[1], w6[2], selector); + w7[0] = __byte_perm_S (w6[0], w6[1], selector); + w6[3] = __byte_perm_S (w5[3], w6[0], selector); + w6[2] = __byte_perm_S (w5[2], w5[3], selector); + w6[1] = __byte_perm_S (w5[1], w5[2], selector); + w6[0] = __byte_perm_S (w5[0], w5[1], selector); + w5[3] = 
__byte_perm_S (w4[3], w5[0], selector); + w5[2] = __byte_perm_S (w4[2], w4[3], selector); + w5[1] = __byte_perm_S (w4[1], w4[2], selector); + w5[0] = __byte_perm_S (w4[0], w4[1], selector); + w4[3] = __byte_perm_S (w3[3], w4[0], selector); + w4[2] = __byte_perm_S (w3[2], w3[3], selector); + w4[1] = __byte_perm_S (w3[1], w3[2], selector); + w4[0] = __byte_perm_S (w3[0], w3[1], selector); + w3[3] = __byte_perm_S (w2[3], w3[0], selector); + w3[2] = __byte_perm_S (w2[2], w2[3], selector); + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 4: + w7[3] = __byte_perm_S (w6[2], w6[3], selector); + w7[2] = __byte_perm_S (w6[1], w6[2], selector); + w7[1] = __byte_perm_S (w6[0], w6[1], selector); + w7[0] = __byte_perm_S (w5[3], w6[0], selector); + w6[3] = __byte_perm_S (w5[2], w5[3], selector); + w6[2] = __byte_perm_S (w5[1], w5[2], selector); + w6[1] = __byte_perm_S (w5[0], w5[1], selector); + w6[0] = __byte_perm_S (w4[3], w5[0], selector); + w5[3] = __byte_perm_S (w4[2], w4[3], selector); + w5[2] = __byte_perm_S (w4[1], w4[2], selector); + w5[1] = __byte_perm_S (w4[0], w4[1], selector); + w5[0] = __byte_perm_S (w3[3], w4[0], selector); + w4[3] = __byte_perm_S (w3[2], w3[3], selector); + w4[2] = __byte_perm_S (w3[1], w3[2], selector); + w4[1] = __byte_perm_S (w3[0], w3[1], selector); + w4[0] = __byte_perm_S (w2[3], w3[0], selector); + w3[3] = __byte_perm_S (w2[2], w2[3], selector); + w3[2] = __byte_perm_S (w2[1], w2[2], selector); + 
w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 5: + w7[3] = __byte_perm_S (w6[1], w6[2], selector); + w7[2] = __byte_perm_S (w6[0], w6[1], selector); + w7[1] = __byte_perm_S (w5[3], w6[0], selector); + w7[0] = __byte_perm_S (w5[2], w5[3], selector); + w6[3] = __byte_perm_S (w5[1], w5[2], selector); + w6[2] = __byte_perm_S (w5[0], w5[1], selector); + w6[1] = __byte_perm_S (w4[3], w5[0], selector); + w6[0] = __byte_perm_S (w4[2], w4[3], selector); + w5[3] = __byte_perm_S (w4[1], w4[2], selector); + w5[2] = __byte_perm_S (w4[0], w4[1], selector); + w5[1] = __byte_perm_S (w3[3], w4[0], selector); + w5[0] = __byte_perm_S (w3[2], w3[3], selector); + w4[3] = __byte_perm_S (w3[1], w3[2], selector); + w4[2] = __byte_perm_S (w3[0], w3[1], selector); + w4[1] = __byte_perm_S (w2[3], w3[0], selector); + w4[0] = __byte_perm_S (w2[2], w2[3], selector); + w3[3] = __byte_perm_S (w2[1], w2[2], selector); + w3[2] = __byte_perm_S (w2[0], w2[1], selector); + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 
6: + w7[3] = __byte_perm_S (w6[0], w6[1], selector); + w7[2] = __byte_perm_S (w5[3], w6[0], selector); + w7[1] = __byte_perm_S (w5[2], w5[3], selector); + w7[0] = __byte_perm_S (w5[1], w5[2], selector); + w6[3] = __byte_perm_S (w5[0], w5[1], selector); + w6[2] = __byte_perm_S (w4[3], w5[0], selector); + w6[1] = __byte_perm_S (w4[2], w4[3], selector); + w6[0] = __byte_perm_S (w4[1], w4[2], selector); + w5[3] = __byte_perm_S (w4[0], w4[1], selector); + w5[2] = __byte_perm_S (w3[3], w4[0], selector); + w5[1] = __byte_perm_S (w3[2], w3[3], selector); + w5[0] = __byte_perm_S (w3[1], w3[2], selector); + w4[3] = __byte_perm_S (w3[0], w3[1], selector); + w4[2] = __byte_perm_S (w2[3], w3[0], selector); + w4[1] = __byte_perm_S (w2[2], w2[3], selector); + w4[0] = __byte_perm_S (w2[1], w2[2], selector); + w3[3] = __byte_perm_S (w2[0], w2[1], selector); + w3[2] = __byte_perm_S (w1[3], w2[0], selector); + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S (w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 7: + w7[3] = __byte_perm_S (w5[3], w6[0], selector); + w7[2] = __byte_perm_S (w5[2], w5[3], selector); + w7[1] = __byte_perm_S (w5[1], w5[2], selector); + w7[0] = __byte_perm_S (w5[0], w5[1], selector); + w6[3] = __byte_perm_S (w4[3], w5[0], selector); + w6[2] = __byte_perm_S (w4[2], w4[3], selector); + w6[1] = __byte_perm_S (w4[1], w4[2], selector); + w6[0] = __byte_perm_S (w4[0], w4[1], selector); + w5[3] = __byte_perm_S (w3[3], w4[0], selector); + w5[2] = __byte_perm_S (w3[2], w3[3], selector); + w5[1] = __byte_perm_S (w3[1], w3[2], selector); + w5[0] = __byte_perm_S (w3[0], w3[1], selector); 
+ w4[3] = __byte_perm_S (w2[3], w3[0], selector); + w4[2] = __byte_perm_S (w2[2], w2[3], selector); + w4[1] = __byte_perm_S (w2[1], w2[2], selector); + w4[0] = __byte_perm_S (w2[0], w2[1], selector); + w3[3] = __byte_perm_S (w1[3], w2[0], selector); + w3[2] = __byte_perm_S (w1[2], w1[3], selector); + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 8: + w7[3] = __byte_perm_S (w5[2], w5[3], selector); + w7[2] = __byte_perm_S (w5[1], w5[2], selector); + w7[1] = __byte_perm_S (w5[0], w5[1], selector); + w7[0] = __byte_perm_S (w4[3], w5[0], selector); + w6[3] = __byte_perm_S (w4[2], w4[3], selector); + w6[2] = __byte_perm_S (w4[1], w4[2], selector); + w6[1] = __byte_perm_S (w4[0], w4[1], selector); + w6[0] = __byte_perm_S (w3[3], w4[0], selector); + w5[3] = __byte_perm_S (w3[2], w3[3], selector); + w5[2] = __byte_perm_S (w3[1], w3[2], selector); + w5[1] = __byte_perm_S (w3[0], w3[1], selector); + w5[0] = __byte_perm_S (w2[3], w3[0], selector); + w4[3] = __byte_perm_S (w2[2], w2[3], selector); + w4[2] = __byte_perm_S (w2[1], w2[2], selector); + w4[1] = __byte_perm_S (w2[0], w2[1], selector); + w4[0] = __byte_perm_S (w1[3], w2[0], selector); + w3[3] = __byte_perm_S (w1[2], w1[3], selector); + w3[2] = __byte_perm_S (w1[1], w1[2], selector); + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + 
w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 9: + w7[3] = __byte_perm_S (w5[1], w5[2], selector); + w7[2] = __byte_perm_S (w5[0], w5[1], selector); + w7[1] = __byte_perm_S (w4[3], w5[0], selector); + w7[0] = __byte_perm_S (w4[2], w4[3], selector); + w6[3] = __byte_perm_S (w4[1], w4[2], selector); + w6[2] = __byte_perm_S (w4[0], w4[1], selector); + w6[1] = __byte_perm_S (w3[3], w4[0], selector); + w6[0] = __byte_perm_S (w3[2], w3[3], selector); + w5[3] = __byte_perm_S (w3[1], w3[2], selector); + w5[2] = __byte_perm_S (w3[0], w3[1], selector); + w5[1] = __byte_perm_S (w2[3], w3[0], selector); + w5[0] = __byte_perm_S (w2[2], w2[3], selector); + w4[3] = __byte_perm_S (w2[1], w2[2], selector); + w4[2] = __byte_perm_S (w2[0], w2[1], selector); + w4[1] = __byte_perm_S (w1[3], w2[0], selector); + w4[0] = __byte_perm_S (w1[2], w1[3], selector); + w3[3] = __byte_perm_S (w1[1], w1[2], selector); + w3[2] = __byte_perm_S (w1[0], w1[1], selector); + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 10: + w7[3] = __byte_perm_S (w5[0], w5[1], selector); + w7[2] = __byte_perm_S (w4[3], w5[0], selector); + w7[1] = __byte_perm_S (w4[2], w4[3], selector); + w7[0] = __byte_perm_S (w4[1], w4[2], selector); + w6[3] = __byte_perm_S (w4[0], w4[1], selector); + w6[2] = __byte_perm_S (w3[3], w4[0], selector); + w6[1] = __byte_perm_S (w3[2], w3[3], selector); + w6[0] = __byte_perm_S (w3[1], w3[2], selector); + w5[3] = __byte_perm_S (w3[0], w3[1], selector); + w5[2] = __byte_perm_S (w2[3], w3[0], selector); + w5[1] = __byte_perm_S (w2[2], w2[3], selector); + w5[0] = __byte_perm_S (w2[1], w2[2], selector); + w4[3] = __byte_perm_S 
(w2[0], w2[1], selector); + w4[2] = __byte_perm_S (w1[3], w2[0], selector); + w4[1] = __byte_perm_S (w1[2], w1[3], selector); + w4[0] = __byte_perm_S (w1[1], w1[2], selector); + w3[3] = __byte_perm_S (w1[0], w1[1], selector); + w3[2] = __byte_perm_S (w0[3], w1[0], selector); + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 11: + w7[3] = __byte_perm_S (w4[3], w5[0], selector); + w7[2] = __byte_perm_S (w4[2], w4[3], selector); + w7[1] = __byte_perm_S (w4[1], w4[2], selector); + w7[0] = __byte_perm_S (w4[0], w4[1], selector); + w6[3] = __byte_perm_S (w3[3], w4[0], selector); + w6[2] = __byte_perm_S (w3[2], w3[3], selector); + w6[1] = __byte_perm_S (w3[1], w3[2], selector); + w6[0] = __byte_perm_S (w3[0], w3[1], selector); + w5[3] = __byte_perm_S (w2[3], w3[0], selector); + w5[2] = __byte_perm_S (w2[2], w2[3], selector); + w5[1] = __byte_perm_S (w2[1], w2[2], selector); + w5[0] = __byte_perm_S (w2[0], w2[1], selector); + w4[3] = __byte_perm_S (w1[3], w2[0], selector); + w4[2] = __byte_perm_S (w1[2], w1[3], selector); + w4[1] = __byte_perm_S (w1[1], w1[2], selector); + w4[0] = __byte_perm_S (w1[0], w1[1], selector); + w3[3] = __byte_perm_S (w0[3], w1[0], selector); + w3[2] = __byte_perm_S (w0[2], w0[3], selector); + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 12: + w7[3] = __byte_perm_S (w4[2], w4[3], selector); + w7[2] = __byte_perm_S (w4[1], w4[2], selector); + w7[1] = __byte_perm_S (w4[0], w4[1], selector); + w7[0] = 
__byte_perm_S (w3[3], w4[0], selector); + w6[3] = __byte_perm_S (w3[2], w3[3], selector); + w6[2] = __byte_perm_S (w3[1], w3[2], selector); + w6[1] = __byte_perm_S (w3[0], w3[1], selector); + w6[0] = __byte_perm_S (w2[3], w3[0], selector); + w5[3] = __byte_perm_S (w2[2], w2[3], selector); + w5[2] = __byte_perm_S (w2[1], w2[2], selector); + w5[1] = __byte_perm_S (w2[0], w2[1], selector); + w5[0] = __byte_perm_S (w1[3], w2[0], selector); + w4[3] = __byte_perm_S (w1[2], w1[3], selector); + w4[2] = __byte_perm_S (w1[1], w1[2], selector); + w4[1] = __byte_perm_S (w1[0], w1[1], selector); + w4[0] = __byte_perm_S (w0[3], w1[0], selector); + w3[3] = __byte_perm_S (w0[2], w0[3], selector); + w3[2] = __byte_perm_S (w0[1], w0[2], selector); + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 13: + w7[3] = __byte_perm_S (w4[1], w4[2], selector); + w7[2] = __byte_perm_S (w4[0], w4[1], selector); + w7[1] = __byte_perm_S (w3[3], w4[0], selector); + w7[0] = __byte_perm_S (w3[2], w3[3], selector); + w6[3] = __byte_perm_S (w3[1], w3[2], selector); + w6[2] = __byte_perm_S (w3[0], w3[1], selector); + w6[1] = __byte_perm_S (w2[3], w3[0], selector); + w6[0] = __byte_perm_S (w2[2], w2[3], selector); + w5[3] = __byte_perm_S (w2[1], w2[2], selector); + w5[2] = __byte_perm_S (w2[0], w2[1], selector); + w5[1] = __byte_perm_S (w1[3], w2[0], selector); + w5[0] = __byte_perm_S (w1[2], w1[3], selector); + w4[3] = __byte_perm_S (w1[1], w1[2], selector); + w4[2] = __byte_perm_S (w1[0], w1[1], selector); + w4[1] = __byte_perm_S (w0[3], w1[0], selector); + w4[0] = __byte_perm_S (w0[2], w0[3], selector); + w3[3] = __byte_perm_S (w0[1], w0[2], selector); + w3[2] = __byte_perm_S (w0[0], w0[1], selector); + w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 
0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 14: + w7[3] = __byte_perm_S (w4[0], w4[1], selector); + w7[2] = __byte_perm_S (w3[3], w4[0], selector); + w7[1] = __byte_perm_S (w3[2], w3[3], selector); + w7[0] = __byte_perm_S (w3[1], w3[2], selector); + w6[3] = __byte_perm_S (w3[0], w3[1], selector); + w6[2] = __byte_perm_S (w2[3], w3[0], selector); + w6[1] = __byte_perm_S (w2[2], w2[3], selector); + w6[0] = __byte_perm_S (w2[1], w2[2], selector); + w5[3] = __byte_perm_S (w2[0], w2[1], selector); + w5[2] = __byte_perm_S (w1[3], w2[0], selector); + w5[1] = __byte_perm_S (w1[2], w1[3], selector); + w5[0] = __byte_perm_S (w1[1], w1[2], selector); + w4[3] = __byte_perm_S (w1[0], w1[1], selector); + w4[2] = __byte_perm_S (w0[3], w1[0], selector); + w4[1] = __byte_perm_S (w0[2], w0[3], selector); + w4[0] = __byte_perm_S (w0[1], w0[2], selector); + w3[3] = __byte_perm_S (w0[0], w0[1], selector); + w3[2] = __byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 15: + w7[3] = __byte_perm_S (w3[3], w4[0], selector); + w7[2] = __byte_perm_S (w3[2], w3[3], selector); + w7[1] = __byte_perm_S (w3[1], w3[2], selector); + w7[0] = __byte_perm_S (w3[0], w3[1], selector); + w6[3] = __byte_perm_S (w2[3], w3[0], selector); + w6[2] = __byte_perm_S (w2[2], w2[3], selector); + w6[1] = __byte_perm_S (w2[1], w2[2], selector); + w6[0] = __byte_perm_S (w2[0], w2[1], selector); + w5[3] = __byte_perm_S (w1[3], w2[0], selector); + w5[2] = __byte_perm_S (w1[2], w1[3], selector); + w5[1] = __byte_perm_S (w1[1], w1[2], selector); + w5[0] = __byte_perm_S (w1[0], w1[1], selector); + w4[3] = __byte_perm_S (w0[3], w1[0], selector); + w4[2] = __byte_perm_S (w0[2], w0[3], selector); + w4[1] = __byte_perm_S (w0[1], w0[2], 
selector); + w4[0] = __byte_perm_S (w0[0], w0[1], selector); + w3[3] = __byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} + /* PACKVS84: applies PACKVS4 to each of the eight (sN, vN) block pairs with the same element selector e. PACKVS4 is defined outside this file; presumably it copies lane e of the vector block vN into the scalar block sN - TODO confirm. */ +#define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ + PACKVS4 (s0, v0, e); \ + PACKVS4 (s1, v1, e); \ + PACKVS4 (s2, v2, e); \ + PACKVS4 (s3, v3, e); \ + PACKVS4 (s4, v4, e); \ + PACKVS4 (s5, v5, e); \ + PACKVS4 (s6, v6, e); \ + PACKVS4 (s7, v7, e); + /* PACKSV84: counterpart of PACKVS84 built from PACKSV4; it is used below after the scalar shift, presumably to write the scalar block sN back into lane e of the vector block vN - TODO confirm against the PACKSV4 definition. */ +#define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ + PACKSV4 (s0, v0, e); \ + PACKSV4 (s1, v1, e); \ + PACKSV4 (s2, v2, e); \ + PACKSV4 (s3, v3, e); \ + PACKSV4 (s4, v4, e); \ + PACKSV4 (s5, v5, e); \ + PACKSV4 (s6, v6, e); \ + PACKSV4 (s7, v7, e); + /* switch_buffer_by_offset_8x4_le_VV: vector front end for switch_buffer_by_offset_8x4_le_S with an individual offset per vector lane. With VECT_SIZE == 1 the arguments are forwarded to the scalar routine unchanged. For VECT_SIZE 2, 4, 8 and 16 every lane is handled in turn: PACKVS84 fills the scalar temporaries t0..t7 from w0..w7, the scalar routine is called on them with that lane's component of offset (offset.s0, offset.s1, ...), and PACKSV84 is applied to move the temporaries back. The blocks w0..w7 are modified in place; nothing is returned. */ +inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) +{ + #if VECT_SIZE == 1 + + switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset); + + #else + + u32 t0[4]; + u32 t1[4]; + u32 t2[4]; + u32 t3[4]; + u32 t4[4]; + u32 t5[4]; + u32 t6[4]; + u32 t7[4]; + + #endif + + #if VECT_SIZE == 2 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + #elif VECT_SIZE == 4 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1,
t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + #elif VECT_SIZE == 8 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + // 5 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); + 
PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + + // 6 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + + // 7 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + + // 8 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + + #elif VECT_SIZE == 16 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + // 5 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, 
t7, offset.s4); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + + // 6 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + + // 7 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + + // 8 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + + // 9 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s8); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); + + // 10 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s9); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); + + // 11 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sa); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); + + // 12 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sb); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); + + // 13 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, 
offset.sc); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); + + // 14 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sd); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); + + // 15 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.se); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); + + // 16 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sf); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); + + #endif +} + /* k_sha512: table of the 80 values SHA512C00..SHA512C4f (macros that come from an included header, presumably the SHA-512 round constants - confirm) declared in __constant memory; sha512_transform reads it as k_sha512[i + 0] .. k_sha512[i + 15]. */ +__constant u64 k_sha512[80] = +{ + SHA512C00, SHA512C01, SHA512C02, SHA512C03, + SHA512C04, SHA512C05, SHA512C06, SHA512C07, + SHA512C08, SHA512C09, SHA512C0a, SHA512C0b, + SHA512C0c, SHA512C0d, SHA512C0e, SHA512C0f, + SHA512C10, SHA512C11, SHA512C12, SHA512C13, + SHA512C14, SHA512C15, SHA512C16, SHA512C17, + SHA512C18, SHA512C19, SHA512C1a, SHA512C1b, + SHA512C1c, SHA512C1d, SHA512C1e, SHA512C1f, + SHA512C20, SHA512C21, SHA512C22, SHA512C23, + SHA512C24, SHA512C25, SHA512C26, SHA512C27, + SHA512C28, SHA512C29, SHA512C2a, SHA512C2b, + SHA512C2c, SHA512C2d, SHA512C2e, SHA512C2f, + SHA512C30, SHA512C31, SHA512C32, SHA512C33, + SHA512C34, SHA512C35, SHA512C36, SHA512C37, + SHA512C38, SHA512C39, SHA512C3a, SHA512C3b, + SHA512C3c, SHA512C3d, SHA512C3e, SHA512C3f, + SHA512C40, SHA512C41, SHA512C42, SHA512C43, + SHA512C44, SHA512C45, SHA512C46, SHA512C47, + SHA512C48, SHA512C49, SHA512C4a, SHA512C4b, + SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, +}; + /* sha512_transform: processes one message block passed as eight groups of four 32-bit words (w0..w7). Each pair of words is joined into one 64-bit word with hl32_to_64, the working variables a..h are loaded from digest[0..7], and ROUND_STEP is run for i = 0, 16, 32, 48 and 64, with ROUND_EXPAND applied before every pass except the first. NOTE: on return digest[0..7] holds the raw working variables a..h; the addition to the previous digest value is disabled (see the commented-out 'rev' block in the body), so the result is not a finished chaining value. */ +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], const u32x w4[4], const u32x w5[4], const u32x w6[4], const u32x w7[4], u64x
digest[8]) +{ + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = hl32_to_64 (w3[2], w3[3]); + u64x w8_t = hl32_to_64 (w4[0], w4[1]); + u64x w9_t = hl32_to_64 (w4[2], w4[3]); + u64x wa_t = hl32_to_64 (w5[0], w5[1]); + u64x wb_t = hl32_to_64 (w5[2], w5[3]); + u64x wc_t = hl32_to_64 (w6[0], w6[1]); + u64x wd_t = hl32_to_64 (w6[2], w6[3]); + u64x we_t = hl32_to_64 (w7[0], w7[1]); + u64x wf_t = hl32_to_64 (w7[2], w7[3]); + + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; + + #define ROUND_EXPAND() \ + { \ + w0_t = SHA512_EXPAND (we_t, w9_t, w1_t, w0_t); \ + w1_t = SHA512_EXPAND (wf_t, wa_t, w2_t, w1_t); \ + w2_t = SHA512_EXPAND (w0_t, wb_t, w3_t, w2_t); \ + w3_t = SHA512_EXPAND (w1_t, wc_t, w4_t, w3_t); \ + w4_t = SHA512_EXPAND (w2_t, wd_t, w5_t, w4_t); \ + w5_t = SHA512_EXPAND (w3_t, we_t, w6_t, w5_t); \ + w6_t = SHA512_EXPAND (w4_t, wf_t, w7_t, w6_t); \ + w7_t = SHA512_EXPAND (w5_t, w0_t, w8_t, w7_t); \ + w8_t = SHA512_EXPAND (w6_t, w1_t, w9_t, w8_t); \ + w9_t = SHA512_EXPAND (w7_t, w2_t, wa_t, w9_t); \ + wa_t = SHA512_EXPAND (w8_t, w3_t, wb_t, wa_t); \ + wb_t = SHA512_EXPAND (w9_t, w4_t, wc_t, wb_t); \ + wc_t = SHA512_EXPAND (wa_t, w5_t, wd_t, wc_t); \ + wd_t = SHA512_EXPAND (wb_t, w6_t, we_t, wd_t); \ + we_t = SHA512_EXPAND (wc_t, w7_t, wf_t, we_t); \ + wf_t = SHA512_EXPAND (wd_t, w8_t, w0_t, wf_t); \ + } + + #define ROUND_STEP(i) \ + { \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha512[i + 0]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha512[i + 1]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha512[i + 
2]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha512[i + 3]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha512[i + 4]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha512[i + 5]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha512[i + 6]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha512[i + 7]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha512[i + 8]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha512[i + 9]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha512[i + 10]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha512[i + 11]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha512[i + 12]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha512[i + 13]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, we_t, k_sha512[i + 14]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha512[i + 15]); \ + } + + ROUND_STEP (0); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 16; i < 80; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } + + /* rev + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; + */ + /* The working variables are stored as they are; the '+=' form above is left disabled. NOTE(review): presumably the reference digest is adjusted on the host side to compensate - confirm. */ + digest[0] = a; + digest[1] = b; + digest[2] = c; + digest[3] = d; + digest[4] = e; + digest[5] = f; + digest[6] = g; + digest[7] = h; +} + /* m15000_m04: kernel for candidates combined from a left and a right word, ending in COMPARE_M_SIMD. Each work item returns early when gid >= gid_max, then loads eight u32 of the left word from pws[gid] and sixteen u32 of salt from salt_bufs[salt_pos]. For every il_pos step (stride VECT_SIZE) it reads the right word from combs_buf, shifts either the right word by pw_l_len (combs_mode == COMBINATOR_MODE_BASE_LEFT) or the left word by pw_r_len, and ORs the two into the password words. The salt words followed by a 0x80 word are moved behind the password with switch_buffer_by_offset_8x4_le_VV, the password is ORed in, the words are passed through swap32, (pw_len + salt_len) * 8 is written to w7_t[3], and sha512_transform is run starting from SHA512M_A..SHA512M_H. The 32-bit halves of digest[7] and digest[3] are handed to COMPARE_M_SIMD. NOTE(review): the 0x80 word sits directly after the 16 salt words while the length field uses salt_len, so the layout presumably relies on salt_len == 64 - confirm. The local lid is assigned but not referenced in the visible body; it may be consumed by the compare macro - confirm. */ +__kernel void m15000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32
*bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * salt + */ + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + u32 salt_buf2[4]; + u32 salt_buf3[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; + salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; + salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; + salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; + salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; + salt_buf3[2] 
= salt_bufs[salt_pos].salt_buf[14]; + salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = 
wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * append salt + */ + + const u32x pw_salt_len = pw_len + salt_len; + + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + u32x w4_t[4]; + u32x w5_t[4]; + u32x w6_t[4]; + u32x w7_t[4]; + + w0_t[0] = salt_buf0[0]; + w0_t[1] = salt_buf0[1]; + w0_t[2] = salt_buf0[2]; + w0_t[3] = salt_buf0[3]; + w1_t[0] = salt_buf1[0]; + w1_t[1] = salt_buf1[1]; + w1_t[2] = salt_buf1[2]; + w1_t[3] = salt_buf1[3]; + w2_t[0] = salt_buf2[0]; + w2_t[1] = salt_buf2[1]; + w2_t[2] = salt_buf2[2]; + w2_t[3] = salt_buf2[3]; + w3_t[0] = salt_buf3[0]; + w3_t[1] = salt_buf3[1]; + w3_t[2] = salt_buf3[2]; + w3_t[3] = salt_buf3[3]; + w4_t[0] = 0x80; + w4_t[1] = 0; + w4_t[2] = 0; + w4_t[3] = 0; + w5_t[0] = 0; + w5_t[1] = 0; + w5_t[2] = 0; + w5_t[3] = 0; + w6_t[0] = 0; + w6_t[1] = 0; + w6_t[2] = 0; + w6_t[3] = 0; + w7_t[0] = 0; + w7_t[1] = 0; + w7_t[2] = 0; + w7_t[3] = 0; + + switch_buffer_by_offset_8x4_le_VV (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, pw_len); + + w0_t[0] |= w0[0]; + w0_t[1] |= w0[1]; + w0_t[2] |= w0[2]; + w0_t[3] |= w0[3]; + w1_t[0] |= w1[0]; + w1_t[1] |= w1[1]; + w1_t[2] |= w1[2]; + w1_t[3] |= w1[3]; + w2_t[0] |= w2[0]; + w2_t[1] |= w2[1]; + w2_t[2] |= w2[2]; + w2_t[3] |= w2[3]; + w3_t[0] |= w3[0]; + w3_t[1] |= w3[1]; + w3_t[2] |= w3[2]; + w3_t[3] |= w3[3]; + + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + w3_t[2] = swap32 (w3_t[2]); + w3_t[3] = swap32 (w3_t[3]); + w4_t[0] = swap32 (w4_t[0]); + w4_t[1] = swap32 (w4_t[1]); + w4_t[2] = swap32 (w4_t[2]); + w4_t[3] = swap32 (w4_t[3]); + w5_t[0] = swap32 (w5_t[0]); + w5_t[1] = swap32 
(w5_t[1]); + w5_t[2] = swap32 (w5_t[2]); + w5_t[3] = swap32 (w5_t[3]); + w6_t[0] = swap32 (w6_t[0]); + w6_t[1] = swap32 (w6_t[1]); + w6_t[2] = swap32 (w6_t[2]); + w6_t[3] = swap32 (w6_t[3]); + w7_t[0] = swap32 (w7_t[0]); + w7_t[1] = swap32 (w7_t[1]); + w7_t[2] = 0; + w7_t[3] = pw_salt_len * 8; + /* w7_t[2] and w7_t[3] are assigned directly instead of going through swap32; w7_t[3] carries (pw_len + salt_len) * 8, i.e. the combined length in bits. */ + /** + * sha512 + */ + + u64x digest[8]; + + digest[0] = SHA512M_A; + digest[1] = SHA512M_B; + digest[2] = SHA512M_C; + digest[3] = SHA512M_D; + digest[4] = SHA512M_E; + digest[5] = SHA512M_F; + digest[6] = SHA512M_G; + digest[7] = SHA512M_H; + + sha512_transform (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, digest); + + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + /* m15000_m08: takes the same parameter list as m15000_m04 but has an empty body, so it performs no work. NOTE(review): presumably a placeholder that exists only so the kernel name is defined - confirm against the host-side kernel lookup. */ +__kernel void m15000_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ +} + /* m15000_m16: like m15000_m08, an empty-bodied kernel with the common parameter list; it performs no work. */ +__kernel void m15000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global
const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ +} + /* m15000_s04: same candidate construction and hashing steps as m15000_m04 (left word from pws[gid], right word from combs_buf, 16 salt words plus a 0x80 word shifted behind the password, swap32, length in w7_t[3], sha512_transform from SHA512M_A..SHA512M_H). The differences are that it additionally loads search[0..3] from digests_buf[digests_offset].digest_buf[DGST_R0..DGST_R3] and finishes with COMPARE_S_SIMD instead of COMPARE_M_SIMD. NOTE(review): search is not referenced by name in the visible body; presumably COMPARE_S_SIMD picks it up - confirm against the macro definition. */ +__kernel void m15000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const
u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * salt + */ + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + u32 salt_buf2[4]; + u32 salt_buf3[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; + salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; + salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; + salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; + salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; + salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; + salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 
}; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * append salt + */ + + const u32x pw_salt_len = pw_len + salt_len; + + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + u32x w4_t[4]; + u32x w5_t[4]; + u32x w6_t[4]; + u32x w7_t[4]; + + w0_t[0] = salt_buf0[0]; + w0_t[1] = salt_buf0[1]; + w0_t[2] = salt_buf0[2]; + w0_t[3] = salt_buf0[3]; + w1_t[0] = salt_buf1[0]; + w1_t[1] = 
salt_buf1[1]; + w1_t[2] = salt_buf1[2]; + w1_t[3] = salt_buf1[3]; + w2_t[0] = salt_buf2[0]; + w2_t[1] = salt_buf2[1]; + w2_t[2] = salt_buf2[2]; + w2_t[3] = salt_buf2[3]; + w3_t[0] = salt_buf3[0]; + w3_t[1] = salt_buf3[1]; + w3_t[2] = salt_buf3[2]; + w3_t[3] = salt_buf3[3]; + w4_t[0] = 0x80; + w4_t[1] = 0; + w4_t[2] = 0; + w4_t[3] = 0; + w5_t[0] = 0; + w5_t[1] = 0; + w5_t[2] = 0; + w5_t[3] = 0; + w6_t[0] = 0; + w6_t[1] = 0; + w6_t[2] = 0; + w6_t[3] = 0; + w7_t[0] = 0; + w7_t[1] = 0; + w7_t[2] = 0; + w7_t[3] = 0; + + switch_buffer_by_offset_8x4_le_VV (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, pw_len); + + w0_t[0] |= w0[0]; + w0_t[1] |= w0[1]; + w0_t[2] |= w0[2]; + w0_t[3] |= w0[3]; + w1_t[0] |= w1[0]; + w1_t[1] |= w1[1]; + w1_t[2] |= w1[2]; + w1_t[3] |= w1[3]; + w2_t[0] |= w2[0]; + w2_t[1] |= w2[1]; + w2_t[2] |= w2[2]; + w2_t[3] |= w2[3]; + w3_t[0] |= w3[0]; + w3_t[1] |= w3[1]; + w3_t[2] |= w3[2]; + w3_t[3] |= w3[3]; + + w0_t[0] = swap32 (w0_t[0]); + w0_t[1] = swap32 (w0_t[1]); + w0_t[2] = swap32 (w0_t[2]); + w0_t[3] = swap32 (w0_t[3]); + w1_t[0] = swap32 (w1_t[0]); + w1_t[1] = swap32 (w1_t[1]); + w1_t[2] = swap32 (w1_t[2]); + w1_t[3] = swap32 (w1_t[3]); + w2_t[0] = swap32 (w2_t[0]); + w2_t[1] = swap32 (w2_t[1]); + w2_t[2] = swap32 (w2_t[2]); + w2_t[3] = swap32 (w2_t[3]); + w3_t[0] = swap32 (w3_t[0]); + w3_t[1] = swap32 (w3_t[1]); + w3_t[2] = swap32 (w3_t[2]); + w3_t[3] = swap32 (w3_t[3]); + w4_t[0] = swap32 (w4_t[0]); + w4_t[1] = swap32 (w4_t[1]); + w4_t[2] = swap32 (w4_t[2]); + w4_t[3] = swap32 (w4_t[3]); + w5_t[0] = swap32 (w5_t[0]); + w5_t[1] = swap32 (w5_t[1]); + w5_t[2] = swap32 (w5_t[2]); + w5_t[3] = swap32 (w5_t[3]); + w6_t[0] = swap32 (w6_t[0]); + w6_t[1] = swap32 (w6_t[1]); + w6_t[2] = swap32 (w6_t[2]); + w6_t[3] = swap32 (w6_t[3]); + w7_t[0] = swap32 (w7_t[0]); + w7_t[1] = swap32 (w7_t[1]); + w7_t[2] = 0; + w7_t[3] = pw_salt_len * 8; + + /** + * sha512 + */ + + u64x digest[8]; + + digest[0] = SHA512M_A; + digest[1] = SHA512M_B; + digest[2] = 
SHA512M_C; + digest[3] = SHA512M_D; + digest[4] = SHA512M_E; + digest[5] = SHA512M_F; + digest[6] = SHA512M_G; + digest[7] = SHA512M_H; + + sha512_transform (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, digest); + + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + /* m15000_s08: takes the same parameter list as m15000_s04 but has an empty body, so it performs no work. NOTE(review): presumably a placeholder that exists only so the kernel name is defined - confirm against the host-side kernel lookup. */ +__kernel void m15000_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ +} + /* m15000_s16: like m15000_s08, an empty-bodied kernel with the common parameter list; it performs no work. */ +__kernel void m15000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d,
__global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ /* m15000_s16 is a no-op: the body is empty, so launching this entry point reads no inputs and writes no results. NOTE(review): presumably a placeholder kept so the symbol exists for the host side; confirm against the kernel loader before removing. */ +} diff --git a/OpenCL/m15000_a3.cl b/OpenCL/m15000_a3.cl new file mode 100644 index 000000000..a3db83c55 --- /dev/null +++ b/OpenCL/m15000_a3.cl @@ -0,0 +1,2313 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define _SHA512_ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset; + + switch (offset / 4) + { + case 0: + w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); + w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[0] = amd_bytealign_S (w5[0],
w5[3], offset_minus_4); + w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 1: + w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[0] = amd_bytealign_S (w6[3], w6[2], 
offset_minus_4); + w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + 
w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 2: + w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = 
w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 3: + w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], 
offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 4: + w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] 
= amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 5: + w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], 
w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 6: + w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[0] = 
amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 7: + w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[2] = amd_bytealign_S (w2[3], w2[2], 
offset_minus_4); + w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 8: + w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[2] = 
amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 9: + w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[2] = amd_bytealign_S (w2[1], w2[0], 
offset_minus_4); + w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 10: + w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[0] = amd_bytealign_S 
(w1[2], w1[1], offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 11: + w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = 
amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 12: + w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 
0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 13: + w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + 
w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 14: + w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 15: + w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[2] = amd_bytealign_S (w2[3], w2[2], 
offset_minus_4); + w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w7[3] = __byte_perm_S (w7[2], w7[3], selector); + w7[2] = __byte_perm_S (w7[1], w7[2], selector); + w7[1] = __byte_perm_S (w7[0], w7[1], selector); + w7[0] = __byte_perm_S (w6[3], w7[0], selector); + w6[3] = __byte_perm_S (w6[2], w6[3], selector); + w6[2] = __byte_perm_S (w6[1], w6[2], selector); + w6[1] = __byte_perm_S (w6[0], w6[1], selector); + w6[0] = __byte_perm_S (w5[3], w6[0], selector); + w5[3] = __byte_perm_S (w5[2], w5[3], selector); + w5[2] = __byte_perm_S (w5[1], w5[2], selector); + w5[1] = __byte_perm_S (w5[0], w5[1], selector); + w5[0] = __byte_perm_S (w4[3], w5[0], selector); + w4[3] = __byte_perm_S (w4[2], 
w4[3], selector); + w4[2] = __byte_perm_S (w4[1], w4[2], selector); + w4[1] = __byte_perm_S (w4[0], w4[1], selector); + w4[0] = __byte_perm_S (w3[3], w4[0], selector); + w3[3] = __byte_perm_S (w3[2], w3[3], selector); + w3[2] = __byte_perm_S (w3[1], w3[2], selector); + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] = __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); + break; + + case 1: + w7[3] = __byte_perm_S (w7[1], w7[2], selector); + w7[2] = __byte_perm_S (w7[0], w7[1], selector); + w7[1] = __byte_perm_S (w6[3], w7[0], selector); + w7[0] = __byte_perm_S (w6[2], w6[3], selector); + w6[3] = __byte_perm_S (w6[1], w6[2], selector); + w6[2] = __byte_perm_S (w6[0], w6[1], selector); + w6[1] = __byte_perm_S (w5[3], w6[0], selector); + w6[0] = __byte_perm_S (w5[2], w5[3], selector); + w5[3] = __byte_perm_S (w5[1], w5[2], selector); + w5[2] = __byte_perm_S (w5[0], w5[1], selector); + w5[1] = __byte_perm_S (w4[3], w5[0], selector); + w5[0] = __byte_perm_S (w4[2], w4[3], selector); + w4[3] = __byte_perm_S (w4[1], w4[2], selector); + w4[2] = __byte_perm_S (w4[0], w4[1], selector); + w4[1] = __byte_perm_S (w3[3], w4[0], selector); + w4[0] = __byte_perm_S (w3[2], w3[3], selector); + w3[3] = __byte_perm_S (w3[1], w3[2], selector); + w3[2] = __byte_perm_S (w3[0], w3[1], selector); + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = 
__byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; + break; + + case 2: + w7[3] = __byte_perm_S (w7[0], w7[1], selector); + w7[2] = __byte_perm_S (w6[3], w7[0], selector); + w7[1] = __byte_perm_S (w6[2], w6[3], selector); + w7[0] = __byte_perm_S (w6[1], w6[2], selector); + w6[3] = __byte_perm_S (w6[0], w6[1], selector); + w6[2] = __byte_perm_S (w5[3], w6[0], selector); + w6[1] = __byte_perm_S (w5[2], w5[3], selector); + w6[0] = __byte_perm_S (w5[1], w5[2], selector); + w5[3] = __byte_perm_S (w5[0], w5[1], selector); + w5[2] = __byte_perm_S (w4[3], w5[0], selector); + w5[1] = __byte_perm_S (w4[2], w4[3], selector); + w5[0] = __byte_perm_S (w4[1], w4[2], selector); + w4[3] = __byte_perm_S (w4[0], w4[1], selector); + w4[2] = __byte_perm_S (w3[3], w4[0], selector); + w4[1] = __byte_perm_S (w3[2], w3[3], selector); + w4[0] = __byte_perm_S (w3[1], w3[2], selector); + w3[3] = __byte_perm_S (w3[0], w3[1], selector); + w3[2] = __byte_perm_S (w2[3], w3[0], selector); + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); + w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S 
(w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w7[3] = __byte_perm_S (w6[3], w7[0], selector); + w7[2] = __byte_perm_S (w6[2], w6[3], selector); + w7[1] = __byte_perm_S (w6[1], w6[2], selector); + w7[0] = __byte_perm_S (w6[0], w6[1], selector); + w6[3] = __byte_perm_S (w5[3], w6[0], selector); + w6[2] = __byte_perm_S (w5[2], w5[3], selector); + w6[1] = __byte_perm_S (w5[1], w5[2], selector); + w6[0] = __byte_perm_S (w5[0], w5[1], selector); + w5[3] = __byte_perm_S (w4[3], w5[0], selector); + w5[2] = __byte_perm_S (w4[2], w4[3], selector); + w5[1] = __byte_perm_S (w4[1], w4[2], selector); + w5[0] = __byte_perm_S (w4[0], w4[1], selector); + w4[3] = __byte_perm_S (w3[3], w4[0], selector); + w4[2] = __byte_perm_S (w3[2], w3[3], selector); + w4[1] = __byte_perm_S (w3[1], w3[2], selector); + w4[0] = __byte_perm_S (w3[0], w3[1], selector); + w3[3] = __byte_perm_S (w2[3], w3[0], selector); + w3[2] = __byte_perm_S (w2[2], w2[3], selector); + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 4: + w7[3] = __byte_perm_S (w6[2], w6[3], selector); + w7[2] = __byte_perm_S (w6[1], w6[2], selector); + w7[1] = __byte_perm_S (w6[0], w6[1], selector); + w7[0] = __byte_perm_S (w5[3], w6[0], selector); + w6[3] = __byte_perm_S (w5[2], w5[3], selector); + w6[2] = __byte_perm_S (w5[1], w5[2], selector); + w6[1] = __byte_perm_S (w5[0], w5[1], selector); + w6[0] = 
__byte_perm_S (w4[3], w5[0], selector); + w5[3] = __byte_perm_S (w4[2], w4[3], selector); + w5[2] = __byte_perm_S (w4[1], w4[2], selector); + w5[1] = __byte_perm_S (w4[0], w4[1], selector); + w5[0] = __byte_perm_S (w3[3], w4[0], selector); + w4[3] = __byte_perm_S (w3[2], w3[3], selector); + w4[2] = __byte_perm_S (w3[1], w3[2], selector); + w4[1] = __byte_perm_S (w3[0], w3[1], selector); + w4[0] = __byte_perm_S (w2[3], w3[0], selector); + w3[3] = __byte_perm_S (w2[2], w2[3], selector); + w3[2] = __byte_perm_S (w2[1], w2[2], selector); + w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 5: + w7[3] = __byte_perm_S (w6[1], w6[2], selector); + w7[2] = __byte_perm_S (w6[0], w6[1], selector); + w7[1] = __byte_perm_S (w5[3], w6[0], selector); + w7[0] = __byte_perm_S (w5[2], w5[3], selector); + w6[3] = __byte_perm_S (w5[1], w5[2], selector); + w6[2] = __byte_perm_S (w5[0], w5[1], selector); + w6[1] = __byte_perm_S (w4[3], w5[0], selector); + w6[0] = __byte_perm_S (w4[2], w4[3], selector); + w5[3] = __byte_perm_S (w4[1], w4[2], selector); + w5[2] = __byte_perm_S (w4[0], w4[1], selector); + w5[1] = __byte_perm_S (w3[3], w4[0], selector); + w5[0] = __byte_perm_S (w3[2], w3[3], selector); + w4[3] = __byte_perm_S (w3[1], w3[2], selector); + w4[2] = __byte_perm_S (w3[0], w3[1], selector); + w4[1] = __byte_perm_S (w2[3], w3[0], selector); + w4[0] = __byte_perm_S (w2[2], w2[3], selector); + w3[3] = __byte_perm_S (w2[1], w2[2], selector); + w3[2] = __byte_perm_S (w2[0], w2[1], 
selector); + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 6: + w7[3] = __byte_perm_S (w6[0], w6[1], selector); + w7[2] = __byte_perm_S (w5[3], w6[0], selector); + w7[1] = __byte_perm_S (w5[2], w5[3], selector); + w7[0] = __byte_perm_S (w5[1], w5[2], selector); + w6[3] = __byte_perm_S (w5[0], w5[1], selector); + w6[2] = __byte_perm_S (w4[3], w5[0], selector); + w6[1] = __byte_perm_S (w4[2], w4[3], selector); + w6[0] = __byte_perm_S (w4[1], w4[2], selector); + w5[3] = __byte_perm_S (w4[0], w4[1], selector); + w5[2] = __byte_perm_S (w3[3], w4[0], selector); + w5[1] = __byte_perm_S (w3[2], w3[3], selector); + w5[0] = __byte_perm_S (w3[1], w3[2], selector); + w4[3] = __byte_perm_S (w3[0], w3[1], selector); + w4[2] = __byte_perm_S (w2[3], w3[0], selector); + w4[1] = __byte_perm_S (w2[2], w2[3], selector); + w4[0] = __byte_perm_S (w2[1], w2[2], selector); + w3[3] = __byte_perm_S (w2[0], w2[1], selector); + w3[2] = __byte_perm_S (w1[3], w2[0], selector); + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S (w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 7: + w7[3] = __byte_perm_S (w5[3], w6[0], selector); + w7[2] = 
__byte_perm_S (w5[2], w5[3], selector); + w7[1] = __byte_perm_S (w5[1], w5[2], selector); + w7[0] = __byte_perm_S (w5[0], w5[1], selector); + w6[3] = __byte_perm_S (w4[3], w5[0], selector); + w6[2] = __byte_perm_S (w4[2], w4[3], selector); + w6[1] = __byte_perm_S (w4[1], w4[2], selector); + w6[0] = __byte_perm_S (w4[0], w4[1], selector); + w5[3] = __byte_perm_S (w3[3], w4[0], selector); + w5[2] = __byte_perm_S (w3[2], w3[3], selector); + w5[1] = __byte_perm_S (w3[1], w3[2], selector); + w5[0] = __byte_perm_S (w3[0], w3[1], selector); + w4[3] = __byte_perm_S (w2[3], w3[0], selector); + w4[2] = __byte_perm_S (w2[2], w2[3], selector); + w4[1] = __byte_perm_S (w2[1], w2[2], selector); + w4[0] = __byte_perm_S (w2[0], w2[1], selector); + w3[3] = __byte_perm_S (w1[3], w2[0], selector); + w3[2] = __byte_perm_S (w1[2], w1[3], selector); + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 8: + w7[3] = __byte_perm_S (w5[2], w5[3], selector); + w7[2] = __byte_perm_S (w5[1], w5[2], selector); + w7[1] = __byte_perm_S (w5[0], w5[1], selector); + w7[0] = __byte_perm_S (w4[3], w5[0], selector); + w6[3] = __byte_perm_S (w4[2], w4[3], selector); + w6[2] = __byte_perm_S (w4[1], w4[2], selector); + w6[1] = __byte_perm_S (w4[0], w4[1], selector); + w6[0] = __byte_perm_S (w3[3], w4[0], selector); + w5[3] = __byte_perm_S (w3[2], w3[3], selector); + w5[2] = __byte_perm_S (w3[1], w3[2], selector); + w5[1] = __byte_perm_S (w3[0], w3[1], selector); + w5[0] = __byte_perm_S (w2[3], w3[0], selector); + w4[3] = __byte_perm_S (w2[2], w2[3], selector); + w4[2] = __byte_perm_S (w2[1], w2[2], selector); 
+ w4[1] = __byte_perm_S (w2[0], w2[1], selector); + w4[0] = __byte_perm_S (w1[3], w2[0], selector); + w3[3] = __byte_perm_S (w1[2], w1[3], selector); + w3[2] = __byte_perm_S (w1[1], w1[2], selector); + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 9: + w7[3] = __byte_perm_S (w5[1], w5[2], selector); + w7[2] = __byte_perm_S (w5[0], w5[1], selector); + w7[1] = __byte_perm_S (w4[3], w5[0], selector); + w7[0] = __byte_perm_S (w4[2], w4[3], selector); + w6[3] = __byte_perm_S (w4[1], w4[2], selector); + w6[2] = __byte_perm_S (w4[0], w4[1], selector); + w6[1] = __byte_perm_S (w3[3], w4[0], selector); + w6[0] = __byte_perm_S (w3[2], w3[3], selector); + w5[3] = __byte_perm_S (w3[1], w3[2], selector); + w5[2] = __byte_perm_S (w3[0], w3[1], selector); + w5[1] = __byte_perm_S (w2[3], w3[0], selector); + w5[0] = __byte_perm_S (w2[2], w2[3], selector); + w4[3] = __byte_perm_S (w2[1], w2[2], selector); + w4[2] = __byte_perm_S (w2[0], w2[1], selector); + w4[1] = __byte_perm_S (w1[3], w2[0], selector); + w4[0] = __byte_perm_S (w1[2], w1[3], selector); + w3[3] = __byte_perm_S (w1[1], w1[2], selector); + w3[2] = __byte_perm_S (w1[0], w1[1], selector); + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 10: + w7[3] = __byte_perm_S (w5[0], w5[1], selector); + w7[2] = __byte_perm_S (w4[3], w5[0], 
selector); + w7[1] = __byte_perm_S (w4[2], w4[3], selector); + w7[0] = __byte_perm_S (w4[1], w4[2], selector); + w6[3] = __byte_perm_S (w4[0], w4[1], selector); + w6[2] = __byte_perm_S (w3[3], w4[0], selector); + w6[1] = __byte_perm_S (w3[2], w3[3], selector); + w6[0] = __byte_perm_S (w3[1], w3[2], selector); + w5[3] = __byte_perm_S (w3[0], w3[1], selector); + w5[2] = __byte_perm_S (w2[3], w3[0], selector); + w5[1] = __byte_perm_S (w2[2], w2[3], selector); + w5[0] = __byte_perm_S (w2[1], w2[2], selector); + w4[3] = __byte_perm_S (w2[0], w2[1], selector); + w4[2] = __byte_perm_S (w1[3], w2[0], selector); + w4[1] = __byte_perm_S (w1[2], w1[3], selector); + w4[0] = __byte_perm_S (w1[1], w1[2], selector); + w3[3] = __byte_perm_S (w1[0], w1[1], selector); + w3[2] = __byte_perm_S (w0[3], w1[0], selector); + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 11: + w7[3] = __byte_perm_S (w4[3], w5[0], selector); + w7[2] = __byte_perm_S (w4[2], w4[3], selector); + w7[1] = __byte_perm_S (w4[1], w4[2], selector); + w7[0] = __byte_perm_S (w4[0], w4[1], selector); + w6[3] = __byte_perm_S (w3[3], w4[0], selector); + w6[2] = __byte_perm_S (w3[2], w3[3], selector); + w6[1] = __byte_perm_S (w3[1], w3[2], selector); + w6[0] = __byte_perm_S (w3[0], w3[1], selector); + w5[3] = __byte_perm_S (w2[3], w3[0], selector); + w5[2] = __byte_perm_S (w2[2], w2[3], selector); + w5[1] = __byte_perm_S (w2[1], w2[2], selector); + w5[0] = __byte_perm_S (w2[0], w2[1], selector); + w4[3] = __byte_perm_S (w1[3], w2[0], selector); + w4[2] = __byte_perm_S (w1[2], w1[3], selector); + w4[1] = __byte_perm_S (w1[1], w1[2], selector); + w4[0] = __byte_perm_S (w1[0], w1[1], selector); + w3[3] = __byte_perm_S (w0[3], w1[0], 
selector); + w3[2] = __byte_perm_S (w0[2], w0[3], selector); + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 12: + w7[3] = __byte_perm_S (w4[2], w4[3], selector); + w7[2] = __byte_perm_S (w4[1], w4[2], selector); + w7[1] = __byte_perm_S (w4[0], w4[1], selector); + w7[0] = __byte_perm_S (w3[3], w4[0], selector); + w6[3] = __byte_perm_S (w3[2], w3[3], selector); + w6[2] = __byte_perm_S (w3[1], w3[2], selector); + w6[1] = __byte_perm_S (w3[0], w3[1], selector); + w6[0] = __byte_perm_S (w2[3], w3[0], selector); + w5[3] = __byte_perm_S (w2[2], w2[3], selector); + w5[2] = __byte_perm_S (w2[1], w2[2], selector); + w5[1] = __byte_perm_S (w2[0], w2[1], selector); + w5[0] = __byte_perm_S (w1[3], w2[0], selector); + w4[3] = __byte_perm_S (w1[2], w1[3], selector); + w4[2] = __byte_perm_S (w1[1], w1[2], selector); + w4[1] = __byte_perm_S (w1[0], w1[1], selector); + w4[0] = __byte_perm_S (w0[3], w1[0], selector); + w3[3] = __byte_perm_S (w0[2], w0[3], selector); + w3[2] = __byte_perm_S (w0[1], w0[2], selector); + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 13: + w7[3] = __byte_perm_S (w4[1], w4[2], selector); + w7[2] = __byte_perm_S (w4[0], w4[1], selector); + w7[1] = __byte_perm_S (w3[3], w4[0], selector); + w7[0] = __byte_perm_S (w3[2], w3[3], selector); + w6[3] = __byte_perm_S (w3[1], w3[2], selector); + w6[2] = __byte_perm_S (w3[0], w3[1], selector); + w6[1] = __byte_perm_S (w2[3], w3[0], selector); + w6[0] = __byte_perm_S (w2[2], w2[3], selector); + w5[3] = __byte_perm_S (w2[1], w2[2], selector); + 
w5[2] = __byte_perm_S (w2[0], w2[1], selector); + w5[1] = __byte_perm_S (w1[3], w2[0], selector); + w5[0] = __byte_perm_S (w1[2], w1[3], selector); + w4[3] = __byte_perm_S (w1[1], w1[2], selector); + w4[2] = __byte_perm_S (w1[0], w1[1], selector); + w4[1] = __byte_perm_S (w0[3], w1[0], selector); + w4[0] = __byte_perm_S (w0[2], w0[3], selector); + w3[3] = __byte_perm_S (w0[1], w0[2], selector); + w3[2] = __byte_perm_S (w0[0], w0[1], selector); + w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 14: + w7[3] = __byte_perm_S (w4[0], w4[1], selector); + w7[2] = __byte_perm_S (w3[3], w4[0], selector); + w7[1] = __byte_perm_S (w3[2], w3[3], selector); + w7[0] = __byte_perm_S (w3[1], w3[2], selector); + w6[3] = __byte_perm_S (w3[0], w3[1], selector); + w6[2] = __byte_perm_S (w2[3], w3[0], selector); + w6[1] = __byte_perm_S (w2[2], w2[3], selector); + w6[0] = __byte_perm_S (w2[1], w2[2], selector); + w5[3] = __byte_perm_S (w2[0], w2[1], selector); + w5[2] = __byte_perm_S (w1[3], w2[0], selector); + w5[1] = __byte_perm_S (w1[2], w1[3], selector); + w5[0] = __byte_perm_S (w1[1], w1[2], selector); + w4[3] = __byte_perm_S (w1[0], w1[1], selector); + w4[2] = __byte_perm_S (w0[3], w1[0], selector); + w4[1] = __byte_perm_S (w0[2], w0[3], selector); + w4[0] = __byte_perm_S (w0[1], w0[2], selector); + w3[3] = __byte_perm_S (w0[0], w0[1], selector); + w3[2] = __byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 15: + w7[3] = __byte_perm_S (w3[3], w4[0], selector); + w7[2] = __byte_perm_S (w3[2], w3[3], selector); + w7[1] = __byte_perm_S (w3[1], w3[2], selector); + w7[0] = __byte_perm_S (w3[0], w3[1], selector); + w6[3] = 
__byte_perm_S (w2[3], w3[0], selector); + w6[2] = __byte_perm_S (w2[2], w2[3], selector); + w6[1] = __byte_perm_S (w2[1], w2[2], selector); + w6[0] = __byte_perm_S (w2[0], w2[1], selector); + w5[3] = __byte_perm_S (w1[3], w2[0], selector); + w5[2] = __byte_perm_S (w1[2], w1[3], selector); + w5[1] = __byte_perm_S (w1[1], w1[2], selector); + w5[0] = __byte_perm_S (w1[0], w1[1], selector); + w4[3] = __byte_perm_S (w0[3], w1[0], selector); + w4[2] = __byte_perm_S (w0[2], w0[3], selector); + w4[1] = __byte_perm_S (w0[1], w0[2], selector); + w4[0] = __byte_perm_S (w0[0], w0[1], selector); + w3[3] = __byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} + +/* Table of the 80 SHA512Cxx constants (the macros are defined outside this chunk). Each ROUND_STEP (i) call reads sixteen consecutive entries starting at index i. */ __constant u64 k_sha512[80] = +{ + SHA512C00, SHA512C01, SHA512C02, SHA512C03, + SHA512C04, SHA512C05, SHA512C06, SHA512C07, + SHA512C08, SHA512C09, SHA512C0a, SHA512C0b, + SHA512C0c, SHA512C0d, SHA512C0e, SHA512C0f, + SHA512C10, SHA512C11, SHA512C12, SHA512C13, + SHA512C14, SHA512C15, SHA512C16, SHA512C17, + SHA512C18, SHA512C19, SHA512C1a, SHA512C1b, + SHA512C1c, SHA512C1d, SHA512C1e, SHA512C1f, + SHA512C20, SHA512C21, SHA512C22, SHA512C23, + SHA512C24, SHA512C25, SHA512C26, SHA512C27, + SHA512C28, SHA512C29, SHA512C2a, SHA512C2b, + SHA512C2c, SHA512C2d, SHA512C2e, SHA512C2f, + SHA512C30, SHA512C31, SHA512C32, SHA512C33, + SHA512C34, SHA512C35, SHA512C36, SHA512C37, + SHA512C38, SHA512C39, SHA512C3a, SHA512C3b, + SHA512C3c, SHA512C3d, SHA512C3e, SHA512C3f, + SHA512C40, SHA512C41, SHA512C42, SHA512C43, + SHA512C44, SHA512C45, SHA512C46, SHA512C47, + SHA512C48, SHA512C49, SHA512C4a, SHA512C4b, + SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, +}; + +/* Runs 80 SHA512_STEP rounds over one block supplied as eight 4-word arrays (w0 .. w7) and overwrites digest[0..7] with the resulting working variables a .. h. The incoming digest values seed a .. h but are not added back at the end. */ static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], const u32x w4[4], const u32x w5[4], const u32x w6[4], const u32x w7[4], u64x 
digest[8]) +{ + /* Join consecutive 32-bit input words pairwise, via hl32_to_64, into sixteen 64-bit message words. */ u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = hl32_to_64 (w3[2], w3[3]); + u64x w8_t = hl32_to_64 (w4[0], w4[1]); + u64x w9_t = hl32_to_64 (w4[2], w4[3]); + u64x wa_t = hl32_to_64 (w5[0], w5[1]); + u64x wb_t = hl32_to_64 (w5[2], w5[3]); + u64x wc_t = hl32_to_64 (w6[0], w6[1]); + u64x wd_t = hl32_to_64 (w6[2], w6[3]); + u64x we_t = hl32_to_64 (w7[0], w7[1]); + u64x wf_t = hl32_to_64 (w7[2], w7[3]); + + /* Working variables are seeded from the digest passed in by the caller. */ u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; + + #define ROUND_EXPAND() \ + { \ + w0_t = SHA512_EXPAND (we_t, w9_t, w1_t, w0_t); \ + w1_t = SHA512_EXPAND (wf_t, wa_t, w2_t, w1_t); \ + w2_t = SHA512_EXPAND (w0_t, wb_t, w3_t, w2_t); \ + w3_t = SHA512_EXPAND (w1_t, wc_t, w4_t, w3_t); \ + w4_t = SHA512_EXPAND (w2_t, wd_t, w5_t, w4_t); \ + w5_t = SHA512_EXPAND (w3_t, we_t, w6_t, w5_t); \ + w6_t = SHA512_EXPAND (w4_t, wf_t, w7_t, w6_t); \ + w7_t = SHA512_EXPAND (w5_t, w0_t, w8_t, w7_t); \ + w8_t = SHA512_EXPAND (w6_t, w1_t, w9_t, w8_t); \ + w9_t = SHA512_EXPAND (w7_t, w2_t, wa_t, w9_t); \ + wa_t = SHA512_EXPAND (w8_t, w3_t, wb_t, wa_t); \ + wb_t = SHA512_EXPAND (w9_t, w4_t, wc_t, wb_t); \ + wc_t = SHA512_EXPAND (wa_t, w5_t, wd_t, wc_t); \ + wd_t = SHA512_EXPAND (wb_t, w6_t, we_t, wd_t); \ + we_t = SHA512_EXPAND (wc_t, w7_t, wf_t, we_t); \ + wf_t = SHA512_EXPAND (wd_t, w8_t, w0_t, wf_t); \ + } + + #define ROUND_STEP(i) \ + { \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha512[i + 0]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha512[i + 1]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha512[i + 
2]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha512[i + 3]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha512[i + 4]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha512[i + 5]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha512[i + 6]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha512[i + 7]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha512[i + 8]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha512[i + 9]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha512[i + 10]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha512[i + 11]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha512[i + 12]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha512[i + 13]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, we_t, k_sha512[i + 14]); \ + SHA512_STEP (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha512[i + 15]); \ + } + + /* The first 16 steps consume the message words as packed; each loop pass below re-expands all 16 words and runs 16 more steps (i = 16, 32, 48, 64). */ ROUND_STEP (0); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 16; i < 80; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } + + /* rev + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; + */ + + /* The add-back of the incoming state is left commented out above (marked rev); the working variables are stored as they are. */ digest[0] = a; + digest[1] = b; + digest[2] = c; + digest[3] = d; + digest[4] = e; + digest[5] = f; + digest[6] = g; + digest[7] = h; +} + +static void m15000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /* Builds one 32-word (128-byte) block by OR-ing the candidate words in w[] over the salt buffer shifted by pw_len, hashes it with sha512_transform once per words_buf_r entry, and passes four 32-bit result words to COMPARE_M_SIMD. */ /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + + /** + * salt + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + const u32 pw_salt_len = pw_len + salt_len; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + u32 salt_buf2[4]; + u32 salt_buf3[4]; + u32 salt_buf4[4]; + u32 salt_buf5[4]; + u32 salt_buf6[4]; + u32 salt_buf7[4]; + + /* Copy the first 16 salt words unconditionally; salt_len itself is only used for the bit length below. */ salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; + salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; + salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; + salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; + salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; + salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; + salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + /* 0x80 goes into the word right after the 16 copied salt words. NOTE(review): this sits directly behind the salt only if salt_len is 64 bytes - confirm against the host-side parser. */ salt_buf4[0] = 0x80; + salt_buf4[1] = 0; + salt_buf4[2] = 0; + salt_buf4[3] = 0; + salt_buf5[0] = 0; + salt_buf5[1] = 0; + 
salt_buf5[2] = 0; + salt_buf5[3] = 0; + salt_buf6[0] = 0; + salt_buf6[1] = 0; + salt_buf6[2] = 0; + salt_buf6[3] = 0; + salt_buf7[0] = 0; + salt_buf7[1] = 0; + salt_buf7[2] = 0; + salt_buf7[3] = 0; + + /* Move the salt words and the 0x80 word up by pw_len bytes; the helper zero-fills the vacated low words in the cases visible in this file. */ switch_buffer_by_offset_8x4_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_buf4, salt_buf5, salt_buf6, salt_buf7, pw_len); + + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + u32x w4_t[4]; + u32x w5_t[4]; + u32x w6_t[4]; + u32x w7_t[4]; + + /* Byte-swap every word of the shifted buffer into the block passed to sha512_transform. */ w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 (salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); + w2_t[0] = swap32 (salt_buf2[0]); + w2_t[1] = swap32 (salt_buf2[1]); + w2_t[2] = swap32 (salt_buf2[2]); + w2_t[3] = swap32 (salt_buf2[3]); + w3_t[0] = swap32 (salt_buf3[0]); + w3_t[1] = swap32 (salt_buf3[1]); + w3_t[2] = swap32 (salt_buf3[2]); + w3_t[3] = swap32 (salt_buf3[3]); + w4_t[0] = swap32 (salt_buf4[0]); + w4_t[1] = swap32 (salt_buf4[1]); + w4_t[2] = swap32 (salt_buf4[2]); + w4_t[3] = swap32 (salt_buf4[3]); + w5_t[0] = swap32 (salt_buf5[0]); + w5_t[1] = swap32 (salt_buf5[1]); + w5_t[2] = swap32 (salt_buf5[2]); + w5_t[3] = swap32 (salt_buf5[3]); + w6_t[0] = swap32 (salt_buf6[0]); + w6_t[1] = swap32 (salt_buf6[1]); + w6_t[2] = swap32 (salt_buf6[2]); + w6_t[3] = swap32 (salt_buf6[3]); + w7_t[0] = swap32 (salt_buf7[0]); + w7_t[1] = swap32 (salt_buf7[1]); + w7_t[2] = swap32 (salt_buf7[2]); + w7_t[3] = swap32 (salt_buf7[3]); + + /* OR the 16 candidate words from w[] into the first 16 block words. */ w0_t[0] |= w[ 0]; + w0_t[1] |= w[ 1]; + w0_t[2] |= w[ 2]; + w0_t[3] |= w[ 3]; + w1_t[0] |= w[ 4]; + w1_t[1] |= w[ 5]; + w1_t[2] |= w[ 6]; + w1_t[3] |= w[ 7]; + w2_t[0] |= w[ 8]; + w2_t[1] |= w[ 9]; + w2_t[2] |= w[10]; + w2_t[3] |= w[11]; + w3_t[0] |= w[12]; + w3_t[1] |= w[13]; + w3_t[2] |= w[14]; + w3_t[3] |= w[15]; + + /* The last block word is overwritten with the combined candidate and salt length in bits. */ w7_t[3] = pw_salt_len * 8; + + /** + * loop + */ + + /* w0l keeps the loop-invariant part of block word 0; each pass ORs in one words_buf_r entry. */ u32x w0l = w0_t[0]; + + for (u32 
il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w0_t[0] = w0; + + /* + * sha512 + */ + + u64x digest[8]; + + digest[0] = SHA512M_A; + digest[1] = SHA512M_B; + digest[2] = SHA512M_C; + digest[3] = SHA512M_D; + digest[4] = SHA512M_E; + digest[5] = SHA512M_F; + digest[6] = SHA512M_G; + digest[7] = SHA512M_H; + + sha512_transform (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, digest); + + /* Only digest[7] and digest[3] are compared, each split into its low and high 32-bit half. */ const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +/* Takes the same parameter list as m15000m; additionally loads search[] from digests_buf[digests_offset] before preparing the salt. */ static void m15000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 
digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /* From here on m15000s prepares a 32-word block: salt words shifted by pw_len, OR-ed with the candidate words in w[], then hashed once per words_buf_r entry and checked with COMPARE_S_SIMD. */ /** + * salt + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + const u32 pw_salt_len = pw_len + salt_len; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + u32 salt_buf2[4]; + u32 salt_buf3[4]; + u32 salt_buf4[4]; + u32 salt_buf5[4]; + u32 salt_buf6[4]; + u32 salt_buf7[4]; + + /* Copy the first 16 salt words unconditionally; salt_len itself is only used for the bit length below. */ salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; + salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; + salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; + salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; + salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; + salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; + salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; + salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; + salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15]; + /* 0x80 goes into the word right after the 16 copied salt words. NOTE(review): this sits directly behind the salt only if salt_len is 64 bytes - confirm against the host-side parser. */ salt_buf4[0] = 0x80; + salt_buf4[1] = 0; + salt_buf4[2] = 0; + salt_buf4[3] = 0; + salt_buf5[0] = 0; + salt_buf5[1] = 0; + salt_buf5[2] = 0; + salt_buf5[3] = 0; + salt_buf6[0] = 0; + salt_buf6[1] = 0; + salt_buf6[2] = 0; + salt_buf6[3] = 0; + salt_buf7[0] = 0; + salt_buf7[1] = 0; + salt_buf7[2] = 0; + salt_buf7[3] = 0; + + /* Move the salt words and the 0x80 word up by pw_len bytes; the helper zero-fills the vacated low words in the cases visible in this file. */ switch_buffer_by_offset_8x4_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_buf4, salt_buf5, salt_buf6, salt_buf7, pw_len); + + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + u32x w4_t[4]; + u32x w5_t[4]; + u32x w6_t[4]; + u32x w7_t[4]; + + /* Byte-swap every word of the shifted buffer into the block passed to sha512_transform. */ w0_t[0] = swap32 (salt_buf0[0]); + w0_t[1] = swap32 (salt_buf0[1]); + w0_t[2] = swap32 (salt_buf0[2]); + w0_t[3] = swap32 (salt_buf0[3]); + w1_t[0] = swap32 (salt_buf1[0]); + w1_t[1] = swap32 
(salt_buf1[1]); + w1_t[2] = swap32 (salt_buf1[2]); + w1_t[3] = swap32 (salt_buf1[3]); + w2_t[0] = swap32 (salt_buf2[0]); + w2_t[1] = swap32 (salt_buf2[1]); + w2_t[2] = swap32 (salt_buf2[2]); + w2_t[3] = swap32 (salt_buf2[3]); + w3_t[0] = swap32 (salt_buf3[0]); + w3_t[1] = swap32 (salt_buf3[1]); + w3_t[2] = swap32 (salt_buf3[2]); + w3_t[3] = swap32 (salt_buf3[3]); + w4_t[0] = swap32 (salt_buf4[0]); + w4_t[1] = swap32 (salt_buf4[1]); + w4_t[2] = swap32 (salt_buf4[2]); + w4_t[3] = swap32 (salt_buf4[3]); + w5_t[0] = swap32 (salt_buf5[0]); + w5_t[1] = swap32 (salt_buf5[1]); + w5_t[2] = swap32 (salt_buf5[2]); + w5_t[3] = swap32 (salt_buf5[3]); + w6_t[0] = swap32 (salt_buf6[0]); + w6_t[1] = swap32 (salt_buf6[1]); + w6_t[2] = swap32 (salt_buf6[2]); + w6_t[3] = swap32 (salt_buf6[3]); + w7_t[0] = swap32 (salt_buf7[0]); + w7_t[1] = swap32 (salt_buf7[1]); + w7_t[2] = swap32 (salt_buf7[2]); + w7_t[3] = swap32 (salt_buf7[3]); + + /* OR the 16 candidate words from w[] into the first 16 block words. */ w0_t[0] |= w[ 0]; + w0_t[1] |= w[ 1]; + w0_t[2] |= w[ 2]; + w0_t[3] |= w[ 3]; + w1_t[0] |= w[ 4]; + w1_t[1] |= w[ 5]; + w1_t[2] |= w[ 6]; + w1_t[3] |= w[ 7]; + w2_t[0] |= w[ 8]; + w2_t[1] |= w[ 9]; + w2_t[2] |= w[10]; + w2_t[3] |= w[11]; + w3_t[0] |= w[12]; + w3_t[1] |= w[13]; + w3_t[2] |= w[14]; + w3_t[3] |= w[15]; + + /* The last block word is overwritten with the combined candidate and salt length in bits. */ w7_t[3] = pw_salt_len * 8; + + /** + * loop + */ + + /* w0l keeps the loop-invariant part of block word 0; each pass ORs in one words_buf_r entry. */ u32x w0l = w0_t[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w0_t[0] = w0; + + /* + * sha512 + */ + + u64x digest[8]; + + digest[0] = SHA512M_A; + digest[1] = SHA512M_B; + digest[2] = SHA512M_C; + digest[3] = SHA512M_D; + digest[4] = SHA512M_E; + digest[5] = SHA512M_F; + digest[6] = SHA512M_G; + digest[7] = SHA512M_H; + + sha512_transform (w0_t, w1_t, w2_t, w3_t, w4_t, w5_t, w6_t, w7_t, digest); + + /* Only digest[7] and digest[3] are compared, each split into its low and high 32-bit half. */ const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); + + 
COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m15000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m15000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); 
+} + +__kernel void m15000_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m15000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, 
digests_offset); +} + +__kernel void m15000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m15000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, 
d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m15000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m15000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, 
bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m15000_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m15000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, 
d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m15000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * base + */ + + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m15000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, 
plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/docs/changes.txt b/docs/changes.txt index 039ef5510..29da622a7 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -22,6 +22,7 @@ - Added hash-mode 14700 = iTunes Backup < 10.0 - Added hash-mode 14800 = iTunes Backup >= 10.0 - Added hash-mode 14900 = Skip32 +- Added hash-mode 15000 = FileZilla Server >= 0.9.55 ## ## Workarounds diff --git a/docs/readme.txt b/docs/readme.txt index 842cf3ee1..11bf6604e 100644 --- a/docs/readme.txt +++ b/docs/readme.txt @@ -138,6 +138,7 @@ NVidia users require "NVIDIA Driver" (367.x or later) - Kerberos 5 AS-REQ Pre-Auth etype 23 - Kerberos 5 TGS-REP etype 23 - Netscape LDAP SHA/SSHA +- FileZilla Server - LM - NTLM - Domain Cached Credentials (DCC), MS Cache diff --git a/extra/tab_completion/hashcat.sh b/extra/tab_completion/hashcat.sh index 1bca43344..ae759b093 100644 --- a/extra/tab_completion/hashcat.sh +++ b/extra/tab_completion/hashcat.sh @@ -176,7 +176,7 @@ _hashcat () { local VERSION=3.30 - local HASH_MODES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 124 130 131 132 133 140 141 150 160 200 300 400 500 501 900 1000 1100 1400 1410 1411 1420 1421 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9710 9720 9800 9810 9820 9900 10000 10100 10200 10300 10400 10410 10420 10500 10600 10700 10800 10900 
11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 13900 14000 14100 14700 14800 14900" + local HASH_MODES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 124 130 131 132 133 140 141 150 160 200 300 400 500 501 900 1000 1100 1400 1410 1411 1420 1421 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9710 9720 9800 9810 9820 9900 10000 10100 10200 10300 10400 10410 10420 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 13900 14000 14100 14700 14800 14900 15000" local ATTACK_MODES="0 1 3 6 7" local OUTFILE_FORMATS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" local OPENCL_DEVICE_TYPES="1 2 3" diff --git a/include/interface.h b/include/interface.h index 11fc7d5d5..08c0d4d03 100644 --- a/include/interface.h +++ b/include/interface.h @@ -1142,6 +1142,8 @@ typedef enum display_len DISPLAY_LEN_MAX_14700 = 15 + 1 + 2 + 1 + 80 + 1 + 6 + 1 + 40 + 1 + 9 + 1 + 40, DISPLAY_LEN_MIN_14900 = 8 + 1 + 8, DISPLAY_LEN_MAX_14900 = 8 + 1 + 8, + DISPLAY_LEN_MIN_15000 = 128 + 1 + 64, + DISPLAY_LEN_MAX_15000 = 128 + 1 + 64, DISPLAY_LEN_MIN_99999 = 1, DISPLAY_LEN_MAX_99999 = 55, @@ -1463,6 +1465,7 @@ typedef enum kern_type KERN_TYPE_ITUNES_BACKUP_9 = 14700, KERN_TYPE_ITUNES_BACKUP_10 = 14800, KERN_TYPE_SKIP32 = 14900, + KERN_TYPE_FILEZILLA_SERVER = 15000, 
KERN_TYPE_PLAINTEXT = 99999, } kern_type_t; @@ -1703,6 +1706,7 @@ int itunes_backup_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_bu int skip32_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED const hashconfig_t *hashconfig); int fortigate_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED const hashconfig_t *hashconfig); int sha256b64s_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED const hashconfig_t *hashconfig); +int filezilla_server_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED const hashconfig_t *hashconfig); /** * hook functions diff --git a/src/interface.c b/src/interface.c index d39d6ddd0..05fb40082 100644 --- a/src/interface.c +++ b/src/interface.c @@ -233,6 +233,7 @@ static const char HT_14600[] = "LUKS"; static const char HT_14700[] = "iTunes Backup < 10.0"; static const char HT_14800[] = "iTunes Backup >= 10.0"; static const char HT_14900[] = "Skip32"; +static const char HT_15000[] = "FileZilla Server >= 0.9.55"; static const char HT_99999[] = "Plaintext"; static const char HT_00011[] = "Joomla < 2.5.18"; @@ -14026,6 +14027,60 @@ int sha256b64s_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE return (PARSER_OK); } +int filezilla_server_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED const hashconfig_t *hashconfig) +{ + if ((input_len < DISPLAY_LEN_MIN_15000) || (input_len > DISPLAY_LEN_MAX_15000)) return (PARSER_GLOBAL_LENGTH); + + u64 *digest = (u64 *) hash_buf->digest; + + salt_t *salt = hash_buf->salt; + + if (is_valid_hex_string (input_buf, 128) == false) return (PARSER_HASH_ENCODING); + + digest[0] = hex_to_u64 ((const u8 *) &input_buf[ 0]); + digest[1] = hex_to_u64 ((const u8 *) &input_buf[ 16]); + digest[2] = hex_to_u64 ((const u8 *) &input_buf[ 32]); + digest[3] = hex_to_u64 ((const u8 *) &input_buf[ 48]); + digest[4] = hex_to_u64 ((const u8 *) &input_buf[ 64]); + digest[5] = hex_to_u64 ((const u8 
*) &input_buf[ 80]); + digest[6] = hex_to_u64 ((const u8 *) &input_buf[ 96]); + digest[7] = hex_to_u64 ((const u8 *) &input_buf[112]); + + digest[0] = byte_swap_64 (digest[0]); + digest[1] = byte_swap_64 (digest[1]); + digest[2] = byte_swap_64 (digest[2]); + digest[3] = byte_swap_64 (digest[3]); + digest[4] = byte_swap_64 (digest[4]); + digest[5] = byte_swap_64 (digest[5]); + digest[6] = byte_swap_64 (digest[6]); + digest[7] = byte_swap_64 (digest[7]); + + digest[0] -= SHA512M_A; + digest[1] -= SHA512M_B; + digest[2] -= SHA512M_C; + digest[3] -= SHA512M_D; + digest[4] -= SHA512M_E; + digest[5] -= SHA512M_F; + digest[6] -= SHA512M_G; + digest[7] -= SHA512M_H; + + if (input_buf[128] != hashconfig->separator) return (PARSER_SEPARATOR_UNMATCHED); + + u32 salt_len = input_len - 128 - 1; + + u8 *salt_buf = input_buf + 128 + 1; + + u8 *salt_buf_ptr = (u8 *) salt->salt_buf; + + salt_len = parse_and_store_salt (salt_buf_ptr, salt_buf, salt_len, hashconfig); + + if (salt_len == UINT_MAX) return (PARSER_SALT_LENGTH); + + salt->salt_len = salt_len; + + return (PARSER_OK); +} + /** * hook functions */ @@ -14618,6 +14673,7 @@ char *strhashtype (const u32 hash_mode) case 14700: return ((char *) HT_14700); case 14800: return ((char *) HT_14800); case 14900: return ((char *) HT_14900); + case 15000: return ((char *) HT_15000); case 99999: return ((char *) HT_99999); } @@ -21844,6 +21900,26 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) hashconfig->dgst_pos3 = 3; break; + case 15000: hashconfig->hash_type = HASH_TYPE_SHA512; + hashconfig->salt_type = SALT_TYPE_INTERN; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_BE; // OPTS_TYPE_ST_ADD80 added within kernel + hashconfig->kern_type = KERN_TYPE_FILEZILLA_SERVER; + hashconfig->dgst_size = DGST_SIZE_8_8; + hashconfig->parse_func = filezilla_server_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_PRECOMPUTE_INIT + | OPTI_TYPE_PRECOMPUTE_MERKLE + | 
OPTI_TYPE_EARLY_SKIP + | OPTI_TYPE_NOT_ITERATED + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 14; + hashconfig->dgst_pos1 = 15; + hashconfig->dgst_pos2 = 6; + hashconfig->dgst_pos3 = 7; + break; + case 99999: hashconfig->hash_type = HASH_TYPE_PLAINTEXT; hashconfig->salt_type = SALT_TYPE_NONE; hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; diff --git a/src/usage.c b/src/usage.c index 3c39c39a3..4a02b68cf 100644 --- a/src/usage.c +++ b/src/usage.c @@ -229,6 +229,7 @@ static const char *USAGE_BIG[] = " 111 | nsldaps, SSHA-1(Base64), Netscape LDAP SSHA | HTTP, SMTP, LDAP Server", " 1411 | SSHA-256(Base64), LDAP {SSHA256} | HTTP, SMTP, LDAP Server", " 1711 | SSHA-512(Base64), LDAP {SSHA512} | HTTP, SMTP, LDAP Server", + " 15000 | FileZilla Server >= 0.9.55 | FTP Server", " 11500 | CRC32 | Checksums", " 3000 | LM | Operating-Systems", " 1000 | NTLM | Operating-Systems", diff --git a/tools/test.pl b/tools/test.pl index 2f36a3486..a4aa859de 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -46,11 +46,11 @@ my $hashcat = "./hashcat"; my $MAX_LEN = 55; -my @modes = (0, 10, 11, 12, 20, 21, 22, 23, 30, 40, 50, 60, 100, 101, 110, 111, 112, 120, 121, 122, 125, 130, 131, 132, 133, 140, 141, 150, 160, 200, 300, 400, 500, 900, 1000, 1100, 1300, 1400, 1410, 1411, 1420, 1430, 1440, 1441, 1450, 1460, 1500, 1600, 1700, 1710, 1711, 1720, 1730, 1740, 1722, 1731, 1750, 1760, 1800, 2100, 2400, 2410, 2500, 2600, 2611, 2612, 2711, 2811, 3000, 3100, 3200, 3710, 3711, 3300, 3500, 3610, 3720, 3800, 3910, 4010, 4110, 4210, 4300, 4400, 4500, 4520, 4521, 4522, 4600, 4700, 4800, 4900, 5000, 5100, 5300, 5400, 5500, 5600, 5700, 5800, 6000, 6100, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8900, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 
11900, 12000, 12100, 12200, 12300, 12400, 12600, 12700, 12800, 12900, 13000, 13100, 13200, 13300, 13400, 13500, 13600, 13800, 13900, 14000, 14100, 14400, 14700, 14800, 14900, 99999); +my @modes = (0, 10, 11, 12, 20, 21, 22, 23, 30, 40, 50, 60, 100, 101, 110, 111, 112, 120, 121, 122, 125, 130, 131, 132, 133, 140, 141, 150, 160, 200, 300, 400, 500, 900, 1000, 1100, 1300, 1400, 1410, 1411, 1420, 1430, 1440, 1441, 1450, 1460, 1500, 1600, 1700, 1710, 1711, 1720, 1730, 1740, 1722, 1731, 1750, 1760, 1800, 2100, 2400, 2410, 2500, 2600, 2611, 2612, 2711, 2811, 3000, 3100, 3200, 3710, 3711, 3300, 3500, 3610, 3720, 3800, 3910, 4010, 4110, 4210, 4300, 4400, 4500, 4520, 4521, 4522, 4600, 4700, 4800, 4900, 5000, 5100, 5300, 5400, 5500, 5600, 5700, 5800, 6000, 6100, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8900, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 11900, 12000, 12100, 12200, 12300, 12400, 12600, 12700, 12800, 12900, 13000, 13100, 13200, 13300, 13400, 13500, 13600, 13800, 13900, 14000, 14100, 14400, 14700, 14800, 14900, 15000, 99999); my %is_unicode = map { $_ => 1 } qw(30 40 130 131 132 133 140 141 1000 1100 1430 1440 1441 1730 1740 1731 5500 5600 8000 9400 9500 9600 9700 9800 11600 13500 13800); my %less_fifteen = map { $_ => 1 } qw(500 1600 1800 2400 2410 3200 6300 7400 10500 10700); -my %allow_long_salt = map { $_ => 1 } qw(2500 4520 4521 5500 5600 7100 7200 7300 9400 9500 9600 9700 9800 10400 10500 10600 10700 1100 11000 11200 11300 11400 11600 12600 13500 13800); +my %allow_long_salt = map { $_ => 1 } qw(2500 4520 4521 5500 5600 7100 7200 7300 9400 9500 9600 9700 9800 10400 10500 10600 10700 1100 11000 11200 11300 11400 11600 12600 13500 13800 15000); my @lotus_magic_table = ( @@ -214,7 +214,7 @@ sub verify # remember always do "exists 
($db->{$hash_in})" checks as soon as possible and don't forget it # unsalted - if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2400 || $mode == 2600 || $mode == 3000 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5000 || $mode == 5100 || $mode == 5700 || $mode == 6000 || $mode == 6100 || $mode == 6900 || $mode == 8600 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 99999) + if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2400 || $mode == 2600 || $mode == 3000 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5000 || $mode == 5100 || $mode == 5700 || $mode == 6000 || $mode == 6100 || $mode == 6900 || $mode == 8600 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 15000 || $mode == 99999) { my $index = index ($line, ":"); @@ -3288,7 +3288,7 @@ sub passthrough { $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, 4)); } - elsif ($mode == 12600) + elsif ($mode == 12600 || $mode == 15000) { $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, 64)); } @@ -4087,7 +4087,7 @@ sub single } } } - elsif ($mode == 12600) + elsif ($mode == 12600 || $mode == 15000) { for (my $i = 1; $i < 32; $i++) { @@ -4740,7 +4740,7 @@ sub gen_hash $tmp_hash = sprintf ("%s", $hash_buf); } - elsif ($mode == 1710) + elsif ($mode == 1710 || $mode == 15000) { $hash_buf = sha512_hex ($word_buf . 
$salt_buf); diff --git a/tools/test.sh b/tools/test.sh index 140068bbd..2489ec071 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -9,7 +9,7 @@ TDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # missing hash types: 5200,6251,6261,6271,6281 -HASH_TYPES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 125 130 131 132 133 140 141 150 160 200 300 400 500 900 1000 1100 1300 1400 1410 1411 1420 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5000 5100 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8900 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11900 12000 12100 12200 12300 12400 12600 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 14000 14100 14400 14600 14700 14800 14900 99999" +HASH_TYPES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 125 130 131 132 133 140 141 150 160 200 300 400 500 900 1000 1100 1300 1400 1410 1411 1420 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5000 5100 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8900 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11900 12000 12100 12200 12300 12400 12600 12800 12900 
13000 13100 13200 13300 13400 13500 13600 13800 14000 14100 14400 14600 14700 14800 14900 15000 99999" #ATTACK_MODES="0 1 3 6 7" ATTACK_MODES="0 1 3 7"