From e210d608f8ffb9484a81c1f9f6eb84f15e98c2c8 Mon Sep 17 00:00:00 2001 From: Royce Williams Date: Thu, 18 Jun 2020 20:58:30 -0800 Subject: [PATCH 1/8] switch to preferred forms for _NSAKEY, Minga --- docs/team.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/team.txt b/docs/team.txt index d0ce79285..0a6e4c771 100644 --- a/docs/team.txt +++ b/docs/team.txt @@ -36,7 +36,7 @@ BlowCane chancas Chick3nman coolbry95 dakykilla deutsch dropdead epixoip EvilMog franky gpufreak hashtka Hydraze J0hnnyBrav0 K9 kontrast23 Kryczek legion m3g9tr0n matrix -minga N|IGHT5 NSAKEY NullMode philsmd +Minga N|IGHT5 _NSAKEY NullMode philsmd purehate radix Rolf rurapenthe s3in!c SuperJames Szul tehnlulz The_Mechanic T0XlC TychoTithonus undeath unix-ninja Xanadrel xmisery From b09bebae3c16788603a00a9a86a6fb1af7b972e1 Mon Sep 17 00:00:00 2001 From: Royce Williams Date: Tue, 23 Jun 2020 12:10:47 -0800 Subject: [PATCH 2/8] make 'Applicable optimizers' less ambiguous --- src/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.c b/src/main.c index 7e045eb6a..48cf59e27 100644 --- a/src/main.c +++ b/src/main.c @@ -471,7 +471,7 @@ static void main_outerloop_mainscreen (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, if (hashconfig->opti_type) { - event_log_info (hashcat_ctx, "Applicable optimizers:"); + event_log_info (hashcat_ctx, "Applicable optimizers applied:"); for (u32 i = 0; i < 32; i++) { From bd9304724c4d82afdee70c3a74f1eaab8f58cbf2 Mon Sep 17 00:00:00 2001 From: philsmd Date: Wed, 24 Jun 2020 23:41:58 +0200 Subject: [PATCH 3/8] fixes #1298: add pure kernels for -m 600 = BLAKE2b-512 --- OpenCL/inc_common.cl | 6776 +++++++++++++++++++++++++++++++++ OpenCL/inc_common.h | 2 + OpenCL/inc_hash_blake2b.cl | 662 ++++ OpenCL/inc_hash_blake2b.h | 90 + OpenCL/m00600_a0-optimized.cl | 280 +- OpenCL/m00600_a0-pure.cl | 111 + OpenCL/m00600_a1-optimized.cl | 281 +- OpenCL/m00600_a1-pure.cl | 109 + OpenCL/m00600_a3-optimized.cl | 946 ++--- OpenCL/m00600_a3-pure.cl | 131 + docs/changes.txt | 6 + src/modules/module_00600.c | 39 +- tools/test_modules/m00600.pm | 2 +- 13 files changed, 8324 insertions(+), 1111 deletions(-) create mode 100644 OpenCL/inc_hash_blake2b.cl create mode 100644 OpenCL/inc_hash_blake2b.h create mode 100644 OpenCL/m00600_a0-pure.cl create mode 100644 OpenCL/m00600_a1-pure.cl create mode 100644 OpenCL/m00600_a3-pure.cl diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 407a24ef6..5c34a824f 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -7499,6 +7499,3394 @@ DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x #endif } +DECLSPEC void switch_buffer_by_offset_8x4_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + c0[0] = hc_bytealign (w7[3], 0, offset); + w7[3] = hc_bytealign (w7[2], w7[3], offset); + w7[2] = hc_bytealign (w7[1], w7[2], offset); + w7[1] = hc_bytealign (w7[0], w7[1], offset); + w7[0] = hc_bytealign (w6[3], w7[0], offset); + w6[3] = hc_bytealign (w6[2], w6[3], offset); + w6[2] = hc_bytealign (w6[1], w6[2], offset); + w6[1] = hc_bytealign (w6[0], w6[1], offset); + w6[0] = hc_bytealign (w5[3], w6[0], offset); + w5[3] = hc_bytealign (w5[2], w5[3], offset); + w5[2] = hc_bytealign (w5[1], w5[2], offset); + w5[1] = hc_bytealign (w5[0], w5[1], offset); + w5[0] = hc_bytealign (w4[3], w5[0], offset); + w4[3] = hc_bytealign (w4[2], w4[3], offset); + w4[2] = hc_bytealign (w4[1], w4[2], offset); + w4[1] = hc_bytealign (w4[0], w4[1], offset); + w4[0] = hc_bytealign (w3[3], w4[0], offset); + w3[3] = hc_bytealign (w3[2], w3[3], offset); + w3[2] = hc_bytealign (w3[1], w3[2], offset); + w3[1] = hc_bytealign (w3[0], w3[1], offset); + w3[0] = hc_bytealign (w2[3], w3[0], offset); + w2[3] = hc_bytealign (w2[2], w2[3], offset); + w2[2] = hc_bytealign (w2[1], w2[2], offset); + w2[1] = hc_bytealign (w2[0], w2[1], offset); + w2[0] = hc_bytealign (w1[3], w2[0], offset); + w1[3] = hc_bytealign (w1[2], w1[3], offset); + w1[2] = hc_bytealign (w1[1], w1[2], offset); + w1[1] = hc_bytealign (w1[0], w1[1], offset); + w1[0] = hc_bytealign (w0[3], w1[0], offset); + w0[3] = hc_bytealign (w0[2], w0[3], offset); + w0[2] = hc_bytealign (w0[1], w0[2], offset); + w0[1] = hc_bytealign (w0[0], w0[1], offset); + w0[0] = hc_bytealign ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = hc_bytealign (w7[3], 0, offset); + c0[0] = hc_bytealign (w7[2], w7[3], offset); + w7[3] = hc_bytealign (w7[1], w7[2], offset); + w7[2] = hc_bytealign (w7[0], w7[1], offset); + w7[1] = hc_bytealign (w6[3], w7[0], offset); + w7[0] = hc_bytealign (w6[2], w6[3], offset); + w6[3] = hc_bytealign (w6[1], w6[2], offset); + w6[2] = hc_bytealign (w6[0], w6[1], offset); + w6[1] = hc_bytealign (w5[3], w6[0], offset); + w6[0] = hc_bytealign (w5[2], w5[3], offset); + w5[3] = hc_bytealign (w5[1], w5[2], offset); + w5[2] = hc_bytealign (w5[0], w5[1], offset); + w5[1] = hc_bytealign (w4[3], w5[0], offset); + w5[0] = hc_bytealign (w4[2], w4[3], offset); + w4[3] = hc_bytealign (w4[1], w4[2], offset); + w4[2] = hc_bytealign (w4[0], w4[1], offset); + w4[1] = hc_bytealign (w3[3], w4[0], offset); + w4[0] = hc_bytealign (w3[2], w3[3], offset); + w3[3] = hc_bytealign (w3[1], w3[2], offset); + w3[2] = hc_bytealign (w3[0], w3[1], offset); + w3[1] = hc_bytealign (w2[3], w3[0], offset); + w3[0] = hc_bytealign (w2[2], w2[3], offset); + w2[3] = hc_bytealign (w2[1], w2[2], offset); + w2[2] = hc_bytealign (w2[0], w2[1], offset); + w2[1] = hc_bytealign (w1[3], w2[0], offset); + w2[0] = hc_bytealign (w1[2], w1[3], offset); + w1[3] = hc_bytealign (w1[1], w1[2], offset); + w1[2] = hc_bytealign (w1[0], w1[1], offset); + w1[1] = hc_bytealign (w0[3], w1[0], offset); + w1[0] = hc_bytealign (w0[2], w0[3], offset); + w0[3] = hc_bytealign (w0[1], w0[2], offset); + w0[2] = hc_bytealign (w0[0], w0[1], offset); + w0[1] = hc_bytealign ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_bytealign (w7[3], 0, offset); + c0[1] = hc_bytealign (w7[2], w7[3], offset); + c0[0] = hc_bytealign (w7[1], w7[2], offset); + w7[3] = hc_bytealign (w7[0], w7[1], offset); + w7[2] = hc_bytealign (w6[3], w7[0], offset); + w7[1] = hc_bytealign (w6[2], w6[3], offset); + w7[0] = hc_bytealign (w6[1], w6[2], offset); + w6[3] = hc_bytealign (w6[0], w6[1], offset); + w6[2] = hc_bytealign (w5[3], w6[0], offset); + w6[1] = hc_bytealign (w5[2], w5[3], offset); + w6[0] = hc_bytealign (w5[1], w5[2], offset); + w5[3] = hc_bytealign (w5[0], w5[1], offset); + w5[2] = hc_bytealign (w4[3], w5[0], offset); + w5[1] = hc_bytealign (w4[2], w4[3], offset); + w5[0] = hc_bytealign (w4[1], w4[2], offset); + w4[3] = hc_bytealign (w4[0], w4[1], offset); + w4[2] = hc_bytealign (w3[3], w4[0], offset); + w4[1] = hc_bytealign (w3[2], w3[3], offset); + w4[0] = hc_bytealign (w3[1], w3[2], offset); + w3[3] = hc_bytealign (w3[0], w3[1], offset); + w3[2] = hc_bytealign (w2[3], w3[0], offset); + w3[1] = hc_bytealign (w2[2], w2[3], offset); + w3[0] = hc_bytealign (w2[1], w2[2], offset); + w2[3] = hc_bytealign (w2[0], w2[1], offset); + w2[2] = hc_bytealign (w1[3], w2[0], offset); + w2[1] = hc_bytealign (w1[2], w1[3], offset); + w2[0] = hc_bytealign (w1[1], w1[2], offset); + w1[3] = hc_bytealign (w1[0], w1[1], offset); + w1[2] = hc_bytealign (w0[3], w1[0], offset); + w1[1] = hc_bytealign (w0[2], w0[3], offset); + w1[0] = hc_bytealign (w0[1], w0[2], offset); + w0[3] = hc_bytealign (w0[0], w0[1], offset); + w0[2] = hc_bytealign ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_bytealign (w7[3], 0, offset); + c0[2] = hc_bytealign (w7[2], w7[3], offset); + c0[1] = hc_bytealign (w7[1], w7[2], offset); + c0[0] = hc_bytealign (w7[0], w7[1], offset); + w7[3] = hc_bytealign (w6[3], w7[0], offset); + w7[2] = hc_bytealign (w6[2], w6[3], offset); + w7[1] = hc_bytealign (w6[1], w6[2], offset); + w7[0] = hc_bytealign (w6[0], w6[1], offset); + w6[3] = hc_bytealign (w5[3], w6[0], offset); + w6[2] = hc_bytealign (w5[2], w5[3], offset); + w6[1] = hc_bytealign (w5[1], w5[2], offset); + w6[0] = hc_bytealign (w5[0], w5[1], offset); + w5[3] = hc_bytealign (w4[3], w5[0], offset); + w5[2] = hc_bytealign (w4[2], w4[3], offset); + w5[1] = hc_bytealign (w4[1], w4[2], offset); + w5[0] = hc_bytealign (w4[0], w4[1], offset); + w4[3] = hc_bytealign (w3[3], w4[0], offset); + w4[2] = hc_bytealign (w3[2], w3[3], offset); + w4[1] = hc_bytealign (w3[1], w3[2], offset); + w4[0] = hc_bytealign (w3[0], w3[1], offset); + w3[3] = hc_bytealign (w2[3], w3[0], offset); + w3[2] = hc_bytealign (w2[2], w2[3], offset); + w3[1] = hc_bytealign (w2[1], w2[2], offset); + w3[0] = hc_bytealign (w2[0], w2[1], offset); + w2[3] = hc_bytealign (w1[3], w2[0], offset); + w2[2] = hc_bytealign (w1[2], w1[3], offset); + w2[1] = hc_bytealign (w1[1], w1[2], offset); + w2[0] = hc_bytealign (w1[0], w1[1], offset); + w1[3] = hc_bytealign (w0[3], w1[0], offset); + w1[2] = hc_bytealign (w0[2], w0[3], offset); + w1[1] = hc_bytealign (w0[1], w0[2], offset); + w1[0] = hc_bytealign (w0[0], w0[1], offset); + w0[3] = hc_bytealign ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_bytealign (w7[3], 0, offset); + c0[3] = hc_bytealign (w7[2], w7[3], offset); + c0[2] = hc_bytealign (w7[1], w7[2], offset); + c0[1] = hc_bytealign (w7[0], w7[1], offset); + c0[0] = hc_bytealign (w6[3], w7[0], offset); + w7[3] = hc_bytealign (w6[2], w6[3], offset); + w7[2] = hc_bytealign (w6[1], w6[2], offset); + w7[1] = hc_bytealign (w6[0], w6[1], offset); + w7[0] = hc_bytealign (w5[3], w6[0], offset); + w6[3] = hc_bytealign (w5[2], w5[3], offset); + w6[2] = hc_bytealign (w5[1], w5[2], offset); + w6[1] = hc_bytealign (w5[0], w5[1], offset); + w6[0] = hc_bytealign (w4[3], w5[0], offset); + w5[3] = hc_bytealign (w4[2], w4[3], offset); + w5[2] = hc_bytealign (w4[1], w4[2], offset); + w5[1] = hc_bytealign (w4[0], w4[1], offset); + w5[0] = hc_bytealign (w3[3], w4[0], offset); + w4[3] = hc_bytealign (w3[2], w3[3], offset); + w4[2] = hc_bytealign (w3[1], w3[2], offset); + w4[1] = hc_bytealign (w3[0], w3[1], offset); + w4[0] = hc_bytealign (w2[3], w3[0], offset); + w3[3] = hc_bytealign (w2[2], w2[3], offset); + w3[2] = hc_bytealign (w2[1], w2[2], offset); + w3[1] = hc_bytealign (w2[0], w2[1], offset); + w3[0] = hc_bytealign (w1[3], w2[0], offset); + w2[3] = hc_bytealign (w1[2], w1[3], offset); + w2[2] = hc_bytealign (w1[1], w1[2], offset); + w2[1] = hc_bytealign (w1[0], w1[1], offset); + w2[0] = hc_bytealign (w0[3], w1[0], offset); + w1[3] = hc_bytealign (w0[2], w0[3], offset); + w1[2] = hc_bytealign (w0[1], w0[2], offset); + w1[1] = hc_bytealign (w0[0], w0[1], offset); + w1[0] = hc_bytealign ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = hc_bytealign (w7[3], 0, offset); + c1[0] = hc_bytealign (w7[2], w7[3], offset); + c0[3] = hc_bytealign (w7[1], w7[2], offset); + c0[2] = hc_bytealign (w7[0], w7[1], offset); + c0[1] = hc_bytealign (w6[3], w7[0], offset); + c0[0] = hc_bytealign (w6[2], w6[3], offset); + w7[3] = hc_bytealign (w6[1], w6[2], offset); + w7[2] = hc_bytealign (w6[0], w6[1], offset); + w7[1] = hc_bytealign (w5[3], w6[0], offset); + w7[0] = hc_bytealign (w5[2], w5[3], offset); + w6[3] = hc_bytealign (w5[1], w5[2], offset); + w6[2] = hc_bytealign (w5[0], w5[1], offset); + w6[1] = hc_bytealign (w4[3], w5[0], offset); + w6[0] = hc_bytealign (w4[2], w4[3], offset); + w5[3] = hc_bytealign (w4[1], w4[2], offset); + w5[2] = hc_bytealign (w4[0], w4[1], offset); + w5[1] = hc_bytealign (w3[3], w4[0], offset); + w5[0] = hc_bytealign (w3[2], w3[3], offset); + w4[3] = hc_bytealign (w3[1], w3[2], offset); + w4[2] = hc_bytealign (w3[0], w3[1], offset); + w4[1] = hc_bytealign (w2[3], w3[0], offset); + w4[0] = hc_bytealign (w2[2], w2[3], offset); + w3[3] = hc_bytealign (w2[1], w2[2], offset); + w3[2] = hc_bytealign (w2[0], w2[1], offset); + w3[1] = hc_bytealign (w1[3], w2[0], offset); + w3[0] = hc_bytealign (w1[2], w1[3], offset); + w2[3] = hc_bytealign (w1[1], w1[2], offset); + w2[2] = hc_bytealign (w1[0], w1[1], offset); + w2[1] = hc_bytealign (w0[3], w1[0], offset); + w2[0] = hc_bytealign (w0[2], w0[3], offset); + w1[3] = hc_bytealign (w0[1], w0[2], offset); + w1[2] = hc_bytealign (w0[0], w0[1], offset); + w1[1] = hc_bytealign ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_bytealign (w7[3], 0, offset); + c1[1] = hc_bytealign (w7[2], w7[3], offset); + c1[0] = hc_bytealign (w7[1], w7[2], offset); + c0[3] = hc_bytealign (w7[0], w7[1], offset); + c0[2] = hc_bytealign (w6[3], w7[0], offset); + c0[1] = hc_bytealign (w6[2], w6[3], offset); + c0[0] = hc_bytealign (w6[1], w6[2], offset); + w7[3] = hc_bytealign (w6[0], w6[1], offset); + w7[2] = hc_bytealign (w5[3], w6[0], offset); + w7[1] = hc_bytealign (w5[2], w5[3], offset); + w7[0] = hc_bytealign (w5[1], w5[2], offset); + w6[3] = hc_bytealign (w5[0], w5[1], offset); + w6[2] = hc_bytealign (w4[3], w5[0], offset); + w6[1] = hc_bytealign (w4[2], w4[3], offset); + w6[0] = hc_bytealign (w4[1], w4[2], offset); + w5[3] = hc_bytealign (w4[0], w4[1], offset); + w5[2] = hc_bytealign (w3[3], w4[0], offset); + w5[1] = hc_bytealign (w3[2], w3[3], offset); + w5[0] = hc_bytealign (w3[1], w3[2], offset); + w4[3] = hc_bytealign (w3[0], w3[1], offset); + w4[2] = hc_bytealign (w2[3], w3[0], offset); + w4[1] = hc_bytealign (w2[2], w2[3], offset); + w4[0] = hc_bytealign (w2[1], w2[2], offset); + w3[3] = hc_bytealign (w2[0], w2[1], offset); + w3[2] = hc_bytealign (w1[3], w2[0], offset); + w3[1] = hc_bytealign (w1[2], w1[3], offset); + w3[0] = hc_bytealign (w1[1], w1[2], offset); + w2[3] = hc_bytealign (w1[0], w1[1], offset); + w2[2] = hc_bytealign (w0[3], w1[0], offset); + w2[1] = hc_bytealign (w0[2], w0[3], offset); + w2[0] = hc_bytealign (w0[1], w0[2], offset); + w1[3] = hc_bytealign (w0[0], w0[1], offset); + w1[2] = hc_bytealign ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = hc_bytealign (w7[3], 0, offset); + c1[2] = hc_bytealign (w7[2], w7[3], offset); + c1[1] = hc_bytealign (w7[1], w7[2], offset); + c1[0] = hc_bytealign (w7[0], w7[1], offset); + c0[3] = hc_bytealign (w6[3], w7[0], offset); + c0[2] = hc_bytealign (w6[2], w6[3], offset); + c0[1] = hc_bytealign (w6[1], w6[2], offset); + c0[0] = hc_bytealign (w6[0], w6[1], offset); + w7[3] = hc_bytealign (w5[3], w6[0], offset); + w7[2] = hc_bytealign (w5[2], w5[3], offset); + w7[1] = hc_bytealign (w5[1], w5[2], offset); + w7[0] = hc_bytealign (w5[0], w5[1], offset); + w6[3] = hc_bytealign (w4[3], w5[0], offset); + w6[2] = hc_bytealign (w4[2], w4[3], offset); + w6[1] = hc_bytealign (w4[1], w4[2], offset); + w6[0] = hc_bytealign (w4[0], w4[1], offset); + w5[3] = hc_bytealign (w3[3], w4[0], offset); + w5[2] = hc_bytealign (w3[2], w3[3], offset); + w5[1] = hc_bytealign (w3[1], w3[2], offset); + w5[0] = hc_bytealign (w3[0], w3[1], offset); + w4[3] = hc_bytealign (w2[3], w3[0], offset); + w4[2] = hc_bytealign (w2[2], w2[3], offset); + w4[1] = hc_bytealign (w2[1], w2[2], offset); + w4[0] = hc_bytealign (w2[0], w2[1], offset); + w3[3] = hc_bytealign (w1[3], w2[0], offset); + w3[2] = hc_bytealign (w1[2], w1[3], offset); + w3[1] = hc_bytealign (w1[1], w1[2], offset); + w3[0] = hc_bytealign (w1[0], w1[1], offset); + w2[3] = hc_bytealign (w0[3], w1[0], offset); + w2[2] = hc_bytealign (w0[2], w0[3], offset); + w2[1] = hc_bytealign (w0[1], w0[2], offset); + w2[0] = hc_bytealign (w0[0], w0[1], offset); + w1[3] = hc_bytealign ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = hc_bytealign (w7[3], 0, offset); + c1[3] = hc_bytealign (w7[2], w7[3], offset); + c1[2] = hc_bytealign (w7[1], w7[2], offset); + c1[1] = hc_bytealign (w7[0], w7[1], offset); + c1[0] = hc_bytealign (w6[3], w7[0], offset); + c0[3] = hc_bytealign (w6[2], w6[3], offset); + c0[2] = hc_bytealign (w6[1], w6[2], offset); + c0[1] = hc_bytealign (w6[0], w6[1], offset); + c0[0] = hc_bytealign (w5[3], w6[0], offset); + w7[3] = hc_bytealign (w5[2], w5[3], offset); + w7[2] = hc_bytealign (w5[1], w5[2], offset); + w7[1] = hc_bytealign (w5[0], w5[1], offset); + w7[0] = hc_bytealign (w4[3], w5[0], offset); + w6[3] = hc_bytealign (w4[2], w4[3], offset); + w6[2] = hc_bytealign (w4[1], w4[2], offset); + w6[1] = hc_bytealign (w4[0], w4[1], offset); + w6[0] = hc_bytealign (w3[3], w4[0], offset); + w5[3] = hc_bytealign (w3[2], w3[3], offset); + w5[2] = hc_bytealign (w3[1], w3[2], offset); + w5[1] = hc_bytealign (w3[0], w3[1], offset); + w5[0] = hc_bytealign (w2[3], w3[0], offset); + w4[3] = hc_bytealign (w2[2], w2[3], offset); + w4[2] = hc_bytealign (w2[1], w2[2], offset); + w4[1] = hc_bytealign (w2[0], w2[1], offset); + w4[0] = hc_bytealign (w1[3], w2[0], offset); + w3[3] = hc_bytealign (w1[2], w1[3], offset); + w3[2] = hc_bytealign (w1[1], w1[2], offset); + w3[1] = hc_bytealign (w1[0], w1[1], offset); + w3[0] = hc_bytealign (w0[3], w1[0], offset); + w2[3] = hc_bytealign (w0[2], w0[3], offset); + w2[2] = hc_bytealign (w0[1], w0[2], offset); + w2[1] = hc_bytealign (w0[0], w0[1], offset); + w2[0] = hc_bytealign ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = hc_bytealign (w7[3], 0, offset); + c2[0] = hc_bytealign (w7[2], w7[3], offset); + c1[3] = hc_bytealign (w7[1], w7[2], offset); + c1[2] = hc_bytealign (w7[0], w7[1], offset); + c1[1] = hc_bytealign (w6[3], w7[0], offset); + c1[0] = hc_bytealign (w6[2], w6[3], offset); + c0[3] = hc_bytealign (w6[1], w6[2], offset); + c0[2] = hc_bytealign (w6[0], w6[1], offset); + c0[1] = hc_bytealign (w5[3], w6[0], offset); + c0[0] = hc_bytealign (w5[2], w5[3], offset); + w7[3] = hc_bytealign (w5[1], w5[2], offset); + w7[2] = hc_bytealign (w5[0], w5[1], offset); + w7[1] = hc_bytealign (w4[3], w5[0], offset); + w7[0] = hc_bytealign (w4[2], w4[3], offset); + w6[3] = hc_bytealign (w4[1], w4[2], offset); + w6[2] = hc_bytealign (w4[0], w4[1], offset); + w6[1] = hc_bytealign (w3[3], w4[0], offset); + w6[0] = hc_bytealign (w3[2], w3[3], offset); + w5[3] = hc_bytealign (w3[1], w3[2], offset); + w5[2] = hc_bytealign (w3[0], w3[1], offset); + w5[1] = hc_bytealign (w2[3], w3[0], offset); + w5[0] = hc_bytealign (w2[2], w2[3], offset); + w4[3] = hc_bytealign (w2[1], w2[2], offset); + w4[2] = hc_bytealign (w2[0], w2[1], offset); + w4[1] = hc_bytealign (w1[3], w2[0], offset); + w4[0] = hc_bytealign (w1[2], w1[3], offset); + w3[3] = hc_bytealign (w1[1], w1[2], offset); + w3[2] = hc_bytealign (w1[0], w1[1], offset); + w3[1] = hc_bytealign (w0[3], w1[0], offset); + w3[0] = hc_bytealign (w0[2], w0[3], offset); + w2[3] = hc_bytealign (w0[1], w0[2], offset); + w2[2] = hc_bytealign (w0[0], w0[1], offset); + w2[1] = hc_bytealign ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = hc_bytealign (w7[3], 0, offset); + c2[1] = hc_bytealign (w7[2], w7[3], offset); + c2[0] = hc_bytealign (w7[1], w7[2], offset); + c1[3] = hc_bytealign (w7[0], w7[1], offset); + c1[2] = hc_bytealign (w6[3], w7[0], offset); + c1[1] = hc_bytealign (w6[2], w6[3], offset); + c1[0] = hc_bytealign (w6[1], w6[2], offset); + c0[3] = hc_bytealign (w6[0], w6[1], offset); + c0[2] = hc_bytealign (w5[3], w6[0], offset); + c0[1] = hc_bytealign (w5[2], w5[3], offset); + c0[0] = hc_bytealign (w5[1], w5[2], offset); + w7[3] = hc_bytealign (w5[0], w5[1], offset); + w7[2] = hc_bytealign (w4[3], w5[0], offset); + w7[1] = hc_bytealign (w4[2], w4[3], offset); + w7[0] = hc_bytealign (w4[1], w4[2], offset); + w6[3] = hc_bytealign (w4[0], w4[1], offset); + w6[2] = hc_bytealign (w3[3], w4[0], offset); + w6[1] = hc_bytealign (w3[2], w3[3], offset); + w6[0] = hc_bytealign (w3[1], w3[2], offset); + w5[3] = hc_bytealign (w3[0], w3[1], offset); + w5[2] = hc_bytealign (w2[3], w3[0], offset); + w5[1] = hc_bytealign (w2[2], w2[3], offset); + w5[0] = hc_bytealign (w2[1], w2[2], offset); + w4[3] = hc_bytealign (w2[0], w2[1], offset); + w4[2] = hc_bytealign (w1[3], w2[0], offset); + w4[1] = hc_bytealign (w1[2], w1[3], offset); + w4[0] = hc_bytealign (w1[1], w1[2], offset); + w3[3] = hc_bytealign (w1[0], w1[1], offset); + w3[2] = hc_bytealign (w0[3], w1[0], offset); + w3[1] = hc_bytealign (w0[2], w0[3], offset); + w3[0] = hc_bytealign (w0[1], w0[2], offset); + w2[3] = hc_bytealign (w0[0], w0[1], offset); + w2[2] = hc_bytealign ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_bytealign (w7[3], 0, offset); + c2[2] = hc_bytealign (w7[2], w7[3], offset); + c2[1] = hc_bytealign (w7[1], w7[2], offset); + c2[0] = hc_bytealign (w7[0], w7[1], offset); + c1[3] = hc_bytealign (w6[3], w7[0], offset); + c1[2] = hc_bytealign (w6[2], w6[3], offset); + c1[1] = hc_bytealign (w6[1], w6[2], offset); + c1[0] = hc_bytealign (w6[0], w6[1], offset); + c0[3] = hc_bytealign (w5[3], w6[0], offset); + c0[2] = hc_bytealign (w5[2], w5[3], offset); + c0[1] = hc_bytealign (w5[1], w5[2], offset); + c0[0] = hc_bytealign (w5[0], w5[1], offset); + w7[3] = hc_bytealign (w4[3], w5[0], offset); + w7[2] = hc_bytealign (w4[2], w4[3], offset); + w7[1] = hc_bytealign (w4[1], w4[2], offset); + w7[0] = hc_bytealign (w4[0], w4[1], offset); + w6[3] = hc_bytealign (w3[3], w4[0], offset); + w6[2] = hc_bytealign (w3[2], w3[3], offset); + w6[1] = hc_bytealign (w3[1], w3[2], offset); + w6[0] = hc_bytealign (w3[0], w3[1], offset); + w5[3] = hc_bytealign (w2[3], w3[0], offset); + w5[2] = hc_bytealign (w2[2], w2[3], offset); + w5[1] = hc_bytealign (w2[1], w2[2], offset); + w5[0] = hc_bytealign (w2[0], w2[1], offset); + w4[3] = hc_bytealign (w1[3], w2[0], offset); + w4[2] = hc_bytealign (w1[2], w1[3], offset); + w4[1] = hc_bytealign (w1[1], w1[2], offset); + w4[0] = hc_bytealign (w1[0], w1[1], offset); + w3[3] = hc_bytealign (w0[3], w1[0], offset); + w3[2] = hc_bytealign (w0[2], w0[3], offset); + w3[1] = hc_bytealign (w0[1], w0[2], offset); + w3[0] = hc_bytealign (w0[0], w0[1], offset); + w2[3] = hc_bytealign ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_bytealign (w7[3], 0, offset); + c2[3] = hc_bytealign (w7[2], w7[3], offset); + c2[2] = hc_bytealign (w7[1], w7[2], offset); + c2[1] = hc_bytealign (w7[0], w7[1], offset); + c2[0] = hc_bytealign (w6[3], w7[0], offset); + c1[3] = hc_bytealign (w6[2], w6[3], offset); + c1[2] = hc_bytealign (w6[1], w6[2], offset); + c1[1] = hc_bytealign (w6[0], w6[1], offset); + c1[0] = hc_bytealign (w5[3], w6[0], offset); + c0[3] = hc_bytealign (w5[2], w5[3], offset); + c0[2] = hc_bytealign (w5[1], w5[2], offset); + c0[1] = hc_bytealign (w5[0], w5[1], offset); + c0[0] = hc_bytealign (w4[3], w5[0], offset); + w7[3] = hc_bytealign (w4[2], w4[3], offset); + w7[2] = hc_bytealign (w4[1], w4[2], offset); + w7[1] = hc_bytealign (w4[0], w4[1], offset); + w7[0] = hc_bytealign (w3[3], w4[0], offset); + w6[3] = hc_bytealign (w3[2], w3[3], offset); + w6[2] = hc_bytealign (w3[1], w3[2], offset); + w6[1] = hc_bytealign (w3[0], w3[1], offset); + w6[0] = hc_bytealign (w2[3], w3[0], offset); + w5[3] = hc_bytealign (w2[2], w2[3], offset); + w5[2] = hc_bytealign (w2[1], w2[2], offset); + w5[1] = hc_bytealign (w2[0], w2[1], offset); + w5[0] = hc_bytealign (w1[3], w2[0], offset); + w4[3] = hc_bytealign (w1[2], w1[3], offset); + w4[2] = hc_bytealign (w1[1], w1[2], offset); + w4[1] = hc_bytealign (w1[0], w1[1], offset); + w4[0] = hc_bytealign (w0[3], w1[0], offset); + w3[3] = hc_bytealign (w0[2], w0[3], offset); + w3[2] = hc_bytealign (w0[1], w0[2], offset); + w3[1] = hc_bytealign (w0[0], w0[1], offset); + w3[0] = hc_bytealign ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = hc_bytealign (w7[3], 0, offset); + c3[0] = hc_bytealign (w7[2], w7[3], offset); + c2[3] = hc_bytealign (w7[1], w7[2], offset); + c2[2] = hc_bytealign (w7[0], w7[1], offset); + c2[1] = hc_bytealign (w6[3], w7[0], offset); + c2[0] = hc_bytealign (w6[2], w6[3], offset); + c1[3] = hc_bytealign (w6[1], w6[2], offset); + c1[2] = hc_bytealign (w6[0], w6[1], offset); + c1[1] = hc_bytealign (w5[3], w6[0], offset); + c1[0] = hc_bytealign (w5[2], w5[3], offset); + c0[3] = hc_bytealign (w5[1], w5[2], offset); + c0[2] = hc_bytealign (w5[0], w5[1], offset); + c0[1] = hc_bytealign (w4[3], w5[0], offset); + c0[0] = hc_bytealign (w4[2], w4[3], offset); + w7[3] = hc_bytealign (w4[1], w4[2], offset); + w7[2] = hc_bytealign (w4[0], w4[1], offset); + w7[1] = hc_bytealign (w3[3], w4[0], offset); + w7[0] = hc_bytealign (w3[2], w3[3], offset); + w6[3] = hc_bytealign (w3[1], w3[2], offset); + w6[2] = hc_bytealign (w3[0], w3[1], offset); + w6[1] = hc_bytealign (w2[3], w3[0], offset); + w6[0] = hc_bytealign (w2[2], w2[3], offset); + w5[3] = hc_bytealign (w2[1], w2[2], offset); + w5[2] = hc_bytealign (w2[0], w2[1], offset); + w5[1] = hc_bytealign (w1[3], w2[0], offset); + w5[0] = hc_bytealign (w1[2], w1[3], offset); + w4[3] = hc_bytealign (w1[1], w1[2], offset); + w4[2] = hc_bytealign (w1[0], w1[1], offset); + w4[1] = hc_bytealign (w0[3], w1[0], offset); + w4[0] = hc_bytealign (w0[2], w0[3], offset); + w3[3] = hc_bytealign (w0[1], w0[2], offset); + w3[2] = hc_bytealign (w0[0], w0[1], offset); + w3[1] = hc_bytealign ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = hc_bytealign (w7[3], 0, offset); + c3[1] = hc_bytealign (w7[2], w7[3], offset); + c3[0] = hc_bytealign (w7[1], w7[2], offset); + c2[3] = hc_bytealign (w7[0], w7[1], offset); + c2[2] = hc_bytealign (w6[3], w7[0], offset); + c2[1] = hc_bytealign (w6[2], w6[3], offset); + c2[0] = hc_bytealign (w6[1], w6[2], offset); + c1[3] = hc_bytealign (w6[0], w6[1], offset); + c1[2] = hc_bytealign (w5[3], w6[0], offset); + c1[1] = hc_bytealign (w5[2], w5[3], offset); + c1[0] = hc_bytealign (w5[1], w5[2], offset); + c0[3] = hc_bytealign (w5[0], w5[1], offset); + c0[2] = hc_bytealign (w4[3], w5[0], offset); + c0[1] = hc_bytealign (w4[2], w4[3], offset); + c0[0] = hc_bytealign (w4[1], w4[2], offset); + w7[3] = hc_bytealign (w4[0], w4[1], offset); + w7[2] = hc_bytealign (w3[3], w4[0], offset); + w7[1] = hc_bytealign (w3[2], w3[3], offset); + w7[0] = hc_bytealign (w3[1], w3[2], offset); + w6[3] = hc_bytealign (w3[0], w3[1], offset); + w6[2] = hc_bytealign (w2[3], w3[0], offset); + w6[1] = hc_bytealign (w2[2], w2[3], offset); + w6[0] = hc_bytealign (w2[1], w2[2], offset); + w5[3] = hc_bytealign (w2[0], w2[1], offset); + w5[2] = hc_bytealign (w1[3], w2[0], offset); + w5[1] = hc_bytealign (w1[2], w1[3], offset); + w5[0] = hc_bytealign (w1[1], w1[2], offset); + w4[3] = hc_bytealign (w1[0], w1[1], offset); + w4[2] = hc_bytealign (w0[3], w1[0], offset); + w4[1] = hc_bytealign (w0[2], w0[3], offset); + w4[0] = hc_bytealign (w0[1], w0[2], offset); + w3[3] = hc_bytealign (w0[0], w0[1], offset); + w3[2] = hc_bytealign ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = hc_bytealign (w7[3], 0, offset); + c3[2] = hc_bytealign (w7[2], w7[3], offset); + c3[1] = hc_bytealign (w7[1], w7[2], offset); + c3[0] = hc_bytealign (w7[0], w7[1], offset); + c2[3] = hc_bytealign (w6[3], w7[0], offset); + c2[2] = hc_bytealign (w6[2], w6[3], offset); + c2[1] = hc_bytealign (w6[1], w6[2], offset); + c2[0] = hc_bytealign (w6[0], w6[1], offset); + c1[3] = hc_bytealign (w5[3], w6[0], offset); + c1[2] = hc_bytealign (w5[2], w5[3], offset); + c1[1] = hc_bytealign (w5[1], w5[2], offset); + c1[0] = hc_bytealign (w5[0], w5[1], offset); + c0[3] = hc_bytealign (w4[3], w5[0], offset); + c0[2] = hc_bytealign (w4[2], w4[3], offset); + c0[1] = hc_bytealign (w4[1], w4[2], offset); + c0[0] = hc_bytealign (w4[0], w4[1], offset); + w7[3] = hc_bytealign (w3[3], w4[0], offset); + w7[2] = hc_bytealign (w3[2], w3[3], offset); + w7[1] = hc_bytealign (w3[1], w3[2], offset); + w7[0] = hc_bytealign (w3[0], w3[1], offset); + w6[3] = hc_bytealign (w2[3], w3[0], offset); + w6[2] = hc_bytealign (w2[2], w2[3], offset); + w6[1] = hc_bytealign (w2[1], w2[2], offset); + w6[0] = hc_bytealign (w2[0], w2[1], offset); + w5[3] = hc_bytealign (w1[3], w2[0], offset); + w5[2] = hc_bytealign (w1[2], w1[3], offset); + w5[1] = hc_bytealign (w1[1], w1[2], offset); + w5[0] = hc_bytealign (w1[0], w1[1], offset); + w4[3] = hc_bytealign (w0[3], w1[0], offset); + w4[2] = hc_bytealign (w0[2], w0[3], offset); + w4[1] = hc_bytealign (w0[1], w0[2], offset); + w4[0] = hc_bytealign (w0[0], w0[1], offset); + w3[3] = hc_bytealign ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + c4[0] = hc_bytealign (w7[3], 0, offset); + c3[3] = hc_bytealign (w7[2], w7[3], offset); + c3[2] = hc_bytealign (w7[1], w7[2], offset); + c3[1] = hc_bytealign (w7[0], w7[1], offset); + c3[0] = hc_bytealign (w6[3], w7[0], offset); + c2[3] = hc_bytealign (w6[2], w6[3], offset); + c2[2] = hc_bytealign (w6[1], w6[2], offset); + c2[1] = hc_bytealign (w6[0], w6[1], offset); + c2[0] = hc_bytealign (w5[3], w6[0], offset); + c1[3] = hc_bytealign (w5[2], w5[3], offset); + c1[2] = hc_bytealign (w5[1], w5[2], offset); + c1[1] = hc_bytealign (w5[0], w5[1], offset); + c1[0] = hc_bytealign (w4[3], w5[0], offset); + c0[3] = hc_bytealign (w4[2], w4[3], offset); + c0[2] = hc_bytealign (w4[1], w4[2], offset); + c0[1] = hc_bytealign (w4[0], w4[1], offset); + c0[0] = hc_bytealign (w3[3], w4[0], offset); + w7[3] = hc_bytealign (w3[2], w3[3], offset); + w7[2] = hc_bytealign (w3[1], w3[2], offset); + w7[1] = hc_bytealign (w3[0], w3[1], offset); + w7[0] = hc_bytealign (w2[3], w3[0], offset); + w6[3] = hc_bytealign (w2[2], w2[3], offset); + w6[2] = hc_bytealign (w2[1], w2[2], offset); + w6[1] = hc_bytealign (w2[0], w2[1], offset); + w6[0] = hc_bytealign (w1[3], w2[0], offset); + w5[3] = hc_bytealign (w1[2], w1[3], offset); + w5[2] = hc_bytealign (w1[1], w1[2], offset); + w5[1] = hc_bytealign (w1[0], w1[1], offset); + w5[0] = hc_bytealign (w0[3], w1[0], offset); + w4[3] = hc_bytealign (w0[2], w0[3], offset); + w4[2] = hc_bytealign (w0[1], w0[2], offset); + w4[1] = hc_bytealign (w0[0], w0[1], offset); + w4[0] = hc_bytealign ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + c4[1] = hc_bytealign (w7[3], 0, offset); + c4[0] = hc_bytealign (w7[2], w7[3], offset); + c3[3] = hc_bytealign (w7[1], w7[2], offset); + c3[2] = hc_bytealign (w7[0], w7[1], offset); + c3[1] = hc_bytealign (w6[3], w7[0], offset); + c3[0] = hc_bytealign (w6[2], w6[3], offset); + c2[3] = hc_bytealign (w6[1], w6[2], offset); + c2[2] = hc_bytealign (w6[0], w6[1], offset); + c2[1] = hc_bytealign (w5[3], w6[0], offset); + c2[0] = hc_bytealign (w5[2], w5[3], offset); + c1[3] = hc_bytealign (w5[1], w5[2], offset); + c1[2] = hc_bytealign (w5[0], w5[1], offset); + c1[1] = hc_bytealign (w4[3], w5[0], offset); + c1[0] = hc_bytealign (w4[2], w4[3], offset); + c0[3] = hc_bytealign (w4[1], w4[2], offset); + c0[2] = hc_bytealign (w4[0], w4[1], offset); + c0[1] = hc_bytealign (w3[3], w4[0], offset); + c0[0] = hc_bytealign (w3[2], w3[3], offset); + w7[3] = hc_bytealign (w3[1], w3[2], offset); + w7[2] = hc_bytealign (w3[0], w3[1], offset); + w7[1] = hc_bytealign (w2[3], w3[0], offset); + w7[0] = hc_bytealign (w2[2], w2[3], offset); + w6[3] = hc_bytealign (w2[1], w2[2], offset); + w6[2] = hc_bytealign (w2[0], w2[1], offset); + w6[1] = hc_bytealign (w1[3], w2[0], offset); + w6[0] = hc_bytealign (w1[2], w1[3], offset); + w5[3] = hc_bytealign (w1[1], w1[2], offset); + w5[2] = hc_bytealign (w1[0], w1[1], offset); + w5[1] = hc_bytealign (w0[3], w1[0], offset); + w5[0] = hc_bytealign (w0[2], w0[3], offset); + w4[3] = hc_bytealign (w0[1], w0[2], offset); + w4[2] = hc_bytealign (w0[0], w0[1], offset); + w4[1] = hc_bytealign ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + c4[2] = hc_bytealign (w7[3], 0, offset); + c4[1] = hc_bytealign (w7[2], w7[3], offset); + c4[0] = hc_bytealign (w7[1], w7[2], offset); + c3[3] = hc_bytealign (w7[0], w7[1], offset); + c3[2] = hc_bytealign (w6[3], w7[0], offset); + c3[1] = hc_bytealign (w6[2], w6[3], offset); + c3[0] = hc_bytealign (w6[1], w6[2], offset); + c2[3] = hc_bytealign (w6[0], w6[1], offset); + c2[2] = hc_bytealign (w5[3], w6[0], offset); + c2[1] = hc_bytealign (w5[2], w5[3], offset); + c2[0] = hc_bytealign (w5[1], w5[2], offset); + c1[3] = hc_bytealign (w5[0], w5[1], offset); + c1[2] = hc_bytealign (w4[3], w5[0], offset); + c1[1] = hc_bytealign (w4[2], w4[3], offset); + c1[0] = hc_bytealign (w4[1], w4[2], offset); + c0[3] = hc_bytealign (w4[0], w4[1], offset); + c0[2] = hc_bytealign (w3[3], w4[0], offset); + c0[1] = hc_bytealign (w3[2], w3[3], offset); + c0[0] = hc_bytealign (w3[1], w3[2], offset); + w7[3] = hc_bytealign (w3[0], w3[1], offset); + w7[2] = hc_bytealign (w2[3], w3[0], offset); + w7[1] = hc_bytealign (w2[2], w2[3], offset); + w7[0] = hc_bytealign (w2[1], w2[2], offset); + w6[3] = hc_bytealign (w2[0], w2[1], offset); + w6[2] = hc_bytealign (w1[3], w2[0], offset); + w6[1] = hc_bytealign (w1[2], w1[3], offset); + w6[0] = hc_bytealign (w1[1], w1[2], offset); + w5[3] = hc_bytealign (w1[0], w1[1], offset); + w5[2] = hc_bytealign (w0[3], w1[0], offset); + w5[1] = hc_bytealign (w0[2], w0[3], offset); + w5[0] = hc_bytealign (w0[1], w0[2], offset); + w4[3] = hc_bytealign (w0[0], w0[1], offset); + w4[2] = hc_bytealign ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_bytealign (w7[3], 0, offset); + c4[2] = hc_bytealign (w7[2], w7[3], offset); + c4[1] = hc_bytealign (w7[1], w7[2], offset); + c4[0] = hc_bytealign (w7[0], w7[1], offset); + c3[3] = hc_bytealign (w6[3], w7[0], offset); + c3[2] = hc_bytealign (w6[2], w6[3], offset); + c3[1] = hc_bytealign (w6[1], w6[2], offset); + c3[0] = hc_bytealign (w6[0], w6[1], offset); + c2[3] = hc_bytealign (w5[3], w6[0], offset); + c2[2] = hc_bytealign (w5[2], w5[3], offset); + c2[1] = hc_bytealign (w5[1], w5[2], offset); + c2[0] = hc_bytealign (w5[0], w5[1], offset); + c1[3] = hc_bytealign (w4[3], w5[0], offset); + c1[2] = hc_bytealign (w4[2], w4[3], offset); + c1[1] = hc_bytealign (w4[1], w4[2], offset); + c1[0] = hc_bytealign (w4[0], w4[1], offset); + c0[3] = hc_bytealign (w3[3], w4[0], offset); + c0[2] = hc_bytealign (w3[2], w3[3], offset); + c0[1] = hc_bytealign (w3[1], w3[2], offset); + c0[0] = hc_bytealign (w3[0], w3[1], offset); + w7[3] = hc_bytealign (w2[3], w3[0], offset); + w7[2] = hc_bytealign (w2[2], w2[3], offset); + w7[1] = hc_bytealign (w2[1], w2[2], offset); + w7[0] = hc_bytealign (w2[0], w2[1], offset); + w6[3] = hc_bytealign (w1[3], w2[0], offset); + w6[2] = hc_bytealign (w1[2], w1[3], offset); + w6[1] = hc_bytealign (w1[1], w1[2], offset); + w6[0] = hc_bytealign (w1[0], w1[1], offset); + w5[3] = hc_bytealign (w0[3], w1[0], offset); + w5[2] = hc_bytealign (w0[2], w0[3], offset); + w5[1] = hc_bytealign (w0[1], w0[2], offset); + w5[0] = hc_bytealign (w0[0], w0[1], offset); + w4[3] = hc_bytealign ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + c5[0] = hc_bytealign (w7[3], 0, offset); + c4[3] = hc_bytealign (w7[2], w7[3], offset); + c4[2] = hc_bytealign (w7[1], w7[2], offset); + c4[1] = hc_bytealign (w7[0], w7[1], offset); + c4[0] = hc_bytealign (w6[3], w7[0], offset); + c3[3] = hc_bytealign (w6[2], w6[3], offset); + c3[2] = hc_bytealign (w6[1], w6[2], offset); + c3[1] = hc_bytealign (w6[0], w6[1], offset); + c3[0] = hc_bytealign (w5[3], w6[0], offset); + c2[3] = hc_bytealign (w5[2], w5[3], offset); + c2[2] = hc_bytealign (w5[1], w5[2], offset); + c2[1] = hc_bytealign (w5[0], w5[1], offset); + c2[0] = hc_bytealign (w4[3], w5[0], offset); + c1[3] = hc_bytealign (w4[2], w4[3], offset); + c1[2] = hc_bytealign (w4[1], w4[2], offset); + c1[1] = hc_bytealign (w4[0], w4[1], offset); + c1[0] = hc_bytealign (w3[3], w4[0], offset); + c0[3] = hc_bytealign (w3[2], w3[3], offset); + c0[2] = hc_bytealign (w3[1], w3[2], offset); + c0[1] = hc_bytealign (w3[0], w3[1], offset); + c0[0] = hc_bytealign (w2[3], w3[0], offset); + w7[3] = hc_bytealign (w2[2], w2[3], offset); + w7[2] = hc_bytealign (w2[1], w2[2], offset); + w7[1] = hc_bytealign (w2[0], w2[1], offset); + w7[0] = hc_bytealign (w1[3], w2[0], offset); + w6[3] = hc_bytealign (w1[2], w1[3], offset); + w6[2] = hc_bytealign (w1[1], w1[2], offset); + w6[1] = hc_bytealign (w1[0], w1[1], offset); + w6[0] = hc_bytealign (w0[3], w1[0], offset); + w5[3] = hc_bytealign (w0[2], w0[3], offset); + w5[2] = hc_bytealign (w0[1], w0[2], offset); + w5[1] = hc_bytealign (w0[0], w0[1], offset); + w5[0] = hc_bytealign ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + c5[1] = hc_bytealign (w7[3], 0, offset); + c5[0] = hc_bytealign (w7[2], w7[3], offset); + c4[3] = hc_bytealign (w7[1], w7[2], offset); + c4[2] = hc_bytealign (w7[0], w7[1], offset); + c4[1] = hc_bytealign (w6[3], w7[0], offset); + c4[0] = hc_bytealign (w6[2], w6[3], offset); + c3[3] = hc_bytealign (w6[1], w6[2], offset); + c3[2] = hc_bytealign (w6[0], w6[1], offset); + c3[1] = hc_bytealign (w5[3], w6[0], offset); + c3[0] = hc_bytealign (w5[2], w5[3], offset); + c2[3] = hc_bytealign (w5[1], w5[2], offset); + c2[2] = hc_bytealign (w5[0], w5[1], offset); + c2[1] = hc_bytealign (w4[3], w5[0], offset); + c2[0] = hc_bytealign (w4[2], w4[3], offset); + c1[3] = hc_bytealign (w4[1], w4[2], offset); + c1[2] = hc_bytealign (w4[0], w4[1], offset); + c1[1] = hc_bytealign (w3[3], w4[0], offset); + c1[0] = hc_bytealign (w3[2], w3[3], offset); + c0[3] = hc_bytealign (w3[1], w3[2], offset); + c0[2] = hc_bytealign (w3[0], w3[1], offset); + c0[1] = hc_bytealign (w2[3], w3[0], offset); + c0[0] = hc_bytealign (w2[2], w2[3], offset); + w7[3] = hc_bytealign (w2[1], w2[2], offset); + w7[2] = hc_bytealign (w2[0], w2[1], offset); + w7[1] = hc_bytealign (w1[3], w2[0], offset); + w7[0] = hc_bytealign (w1[2], w1[3], offset); + w6[3] = hc_bytealign (w1[1], w1[2], offset); + w6[2] = hc_bytealign (w1[0], w1[1], offset); + w6[1] = hc_bytealign (w0[3], w1[0], offset); + w6[0] = hc_bytealign (w0[2], w0[3], offset); + w5[3] = hc_bytealign (w0[1], w0[2], offset); + w5[2] = hc_bytealign (w0[0], w0[1], offset); + w5[1] = hc_bytealign ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + c5[2] = hc_bytealign (w7[3], 0, offset); + c5[1] = hc_bytealign (w7[2], w7[3], offset); + c5[0] = hc_bytealign (w7[1], w7[2], offset); + c4[3] = hc_bytealign (w7[0], w7[1], offset); + c4[2] = hc_bytealign (w6[3], w7[0], offset); + c4[1] = hc_bytealign (w6[2], w6[3], offset); + c4[0] = hc_bytealign (w6[1], w6[2], offset); + c3[3] = hc_bytealign (w6[0], w6[1], offset); + c3[2] = hc_bytealign (w5[3], w6[0], offset); + c3[1] = hc_bytealign (w5[2], w5[3], offset); + c3[0] = hc_bytealign (w5[1], w5[2], offset); + c2[3] = hc_bytealign (w5[0], w5[1], offset); + c2[2] = hc_bytealign (w4[3], w5[0], offset); + c2[1] = hc_bytealign (w4[2], w4[3], offset); + c2[0] = hc_bytealign (w4[1], w4[2], offset); + c1[3] = hc_bytealign (w4[0], w4[1], offset); + c1[2] = hc_bytealign (w3[3], w4[0], offset); + c1[1] = hc_bytealign (w3[2], w3[3], offset); + c1[0] = hc_bytealign (w3[1], w3[2], offset); + c0[3] = hc_bytealign (w3[0], w3[1], offset); + c0[2] = hc_bytealign (w2[3], w3[0], offset); + c0[1] = hc_bytealign (w2[2], w2[3], offset); + c0[0] = hc_bytealign (w2[1], w2[2], offset); + w7[3] = hc_bytealign (w2[0], w2[1], offset); + w7[2] = hc_bytealign (w1[3], w2[0], offset); + w7[1] = hc_bytealign (w1[2], w1[3], offset); + w7[0] = hc_bytealign (w1[1], w1[2], offset); + w6[3] = hc_bytealign (w1[0], w1[1], offset); + w6[2] = hc_bytealign (w0[3], w1[0], offset); + w6[1] = hc_bytealign (w0[2], w0[3], offset); + w6[0] = hc_bytealign (w0[1], w0[2], offset); + w5[3] = hc_bytealign (w0[0], w0[1], offset); + w5[2] = hc_bytealign ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + c5[3] = hc_bytealign (w7[3], 0, offset); + c5[2] = hc_bytealign (w7[2], w7[3], offset); + c5[1] = hc_bytealign (w7[1], w7[2], offset); + c5[0] = hc_bytealign (w7[0], w7[1], offset); + c4[3] = hc_bytealign (w6[3], w7[0], offset); + c4[2] = hc_bytealign (w6[2], w6[3], offset); + c4[1] = hc_bytealign (w6[1], w6[2], offset); + c4[0] = hc_bytealign (w6[0], w6[1], offset); + c3[3] = hc_bytealign (w5[3], w6[0], offset); + c3[2] = hc_bytealign (w5[2], w5[3], offset); + c3[1] = hc_bytealign (w5[1], w5[2], offset); + c3[0] = hc_bytealign (w5[0], w5[1], offset); + c2[3] = hc_bytealign (w4[3], w5[0], offset); + c2[2] = hc_bytealign (w4[2], w4[3], offset); + c2[1] = hc_bytealign (w4[1], w4[2], offset); + c2[0] = hc_bytealign (w4[0], w4[1], offset); + c1[3] = hc_bytealign (w3[3], w4[0], offset); + c1[2] = hc_bytealign (w3[2], w3[3], offset); + c1[1] = hc_bytealign (w3[1], w3[2], offset); + c1[0] = hc_bytealign (w3[0], w3[1], offset); + c0[3] = hc_bytealign (w2[3], w3[0], offset); + c0[2] = hc_bytealign (w2[2], w2[3], offset); + c0[1] = hc_bytealign (w2[1], w2[2], offset); + c0[0] = hc_bytealign (w2[0], w2[1], offset); + w7[3] = hc_bytealign (w1[3], w2[0], offset); + w7[2] = hc_bytealign (w1[2], w1[3], offset); + w7[1] = hc_bytealign (w1[1], w1[2], offset); + w7[0] = hc_bytealign (w1[0], w1[1], offset); + w6[3] = hc_bytealign (w0[3], w1[0], offset); + w6[2] = hc_bytealign (w0[2], w0[3], offset); + w6[1] = hc_bytealign (w0[1], w0[2], offset); + w6[0] = hc_bytealign (w0[0], w0[1], offset); + w5[3] = hc_bytealign ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_bytealign (w7[3], 0, offset); + c5[3] = hc_bytealign (w7[2], w7[3], offset); + c5[2] = hc_bytealign (w7[1], w7[2], offset); + c5[1] = hc_bytealign (w7[0], w7[1], offset); + c5[0] = hc_bytealign (w6[3], w7[0], offset); + c4[3] = hc_bytealign (w6[2], w6[3], offset); + c4[2] = hc_bytealign (w6[1], w6[2], offset); + c4[1] = hc_bytealign (w6[0], w6[1], offset); + c4[0] = hc_bytealign (w5[3], w6[0], offset); + c3[3] = hc_bytealign (w5[2], w5[3], offset); + c3[2] = hc_bytealign (w5[1], w5[2], offset); + c3[1] = hc_bytealign (w5[0], w5[1], offset); + c3[0] = hc_bytealign (w4[3], w5[0], offset); + c2[3] = hc_bytealign (w4[2], w4[3], offset); + c2[2] = hc_bytealign (w4[1], w4[2], offset); + c2[1] = hc_bytealign (w4[0], w4[1], offset); + c2[0] = hc_bytealign (w3[3], w4[0], offset); + c1[3] = hc_bytealign (w3[2], w3[3], offset); + c1[2] = hc_bytealign (w3[1], w3[2], offset); + c1[1] = hc_bytealign (w3[0], w3[1], offset); + c1[0] = hc_bytealign (w2[3], w3[0], offset); + c0[3] = hc_bytealign (w2[2], w2[3], offset); + c0[2] = hc_bytealign (w2[1], w2[2], offset); + c0[1] = hc_bytealign (w2[0], w2[1], offset); + c0[0] = hc_bytealign (w1[3], w2[0], offset); + w7[3] = hc_bytealign (w1[2], w1[3], offset); + w7[2] = hc_bytealign (w1[1], w1[2], offset); + w7[1] = hc_bytealign (w1[0], w1[1], offset); + w7[0] = hc_bytealign (w0[3], w1[0], offset); + w6[3] = hc_bytealign (w0[2], w0[3], offset); + w6[2] = hc_bytealign (w0[1], w0[2], offset); + w6[1] = hc_bytealign (w0[0], w0[1], offset); + w6[0] = hc_bytealign ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_bytealign (w7[3], 0, offset); + c6[0] = hc_bytealign (w7[2], w7[3], offset); + c5[3] = hc_bytealign (w7[1], w7[2], offset); + c5[2] = hc_bytealign (w7[0], w7[1], offset); + c5[1] = hc_bytealign (w6[3], w7[0], offset); + c5[0] = hc_bytealign (w6[2], w6[3], offset); + c4[3] = hc_bytealign (w6[1], w6[2], offset); + c4[2] = hc_bytealign (w6[0], w6[1], offset); + c4[1] = hc_bytealign (w5[3], w6[0], offset); + c4[0] = hc_bytealign (w5[2], w5[3], offset); + c3[3] = hc_bytealign (w5[1], w5[2], offset); + c3[2] = hc_bytealign (w5[0], w5[1], offset); + c3[1] = hc_bytealign (w4[3], w5[0], offset); + c3[0] = hc_bytealign (w4[2], w4[3], offset); + c2[3] = hc_bytealign (w4[1], w4[2], offset); + c2[2] = hc_bytealign (w4[0], w4[1], offset); + c2[1] = hc_bytealign (w3[3], w4[0], offset); + c2[0] = hc_bytealign (w3[2], w3[3], offset); + c1[3] = hc_bytealign (w3[1], w3[2], offset); + c1[2] = hc_bytealign (w3[0], w3[1], offset); + c1[1] = hc_bytealign (w2[3], w3[0], offset); + c1[0] = hc_bytealign (w2[2], w2[3], offset); + c0[3] = hc_bytealign (w2[1], w2[2], offset); + c0[2] = hc_bytealign (w2[0], w2[1], offset); + c0[1] = hc_bytealign (w1[3], w2[0], offset); + c0[0] = hc_bytealign (w1[2], w1[3], offset); + w7[3] = hc_bytealign (w1[1], w1[2], offset); + w7[2] = hc_bytealign (w1[0], w1[1], offset); + w7[1] = hc_bytealign (w0[3], w1[0], offset); + w7[0] = hc_bytealign (w0[2], w0[3], offset); + w6[3] = hc_bytealign (w0[1], w0[2], offset); + w6[2] = hc_bytealign (w0[0], w0[1], offset); + w6[1] = hc_bytealign ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + c6[2] = hc_bytealign (w7[3], 0, offset); + c6[1] = hc_bytealign (w7[2], w7[3], offset); + c6[0] = hc_bytealign (w7[1], w7[2], offset); + c5[3] = hc_bytealign (w7[0], w7[1], offset); + c5[2] = hc_bytealign (w6[3], w7[0], offset); + c5[1] = hc_bytealign (w6[2], w6[3], offset); + c5[0] = hc_bytealign (w6[1], w6[2], offset); + c4[3] = hc_bytealign (w6[0], w6[1], offset); + c4[2] = hc_bytealign (w5[3], w6[0], offset); + c4[1] = hc_bytealign (w5[2], w5[3], offset); + c4[0] = hc_bytealign (w5[1], w5[2], offset); + c3[3] = hc_bytealign (w5[0], w5[1], offset); + c3[2] = hc_bytealign (w4[3], w5[0], offset); + c3[1] = hc_bytealign (w4[2], w4[3], offset); + c3[0] = hc_bytealign (w4[1], w4[2], offset); + c2[3] = hc_bytealign (w4[0], w4[1], offset); + c2[2] = hc_bytealign (w3[3], w4[0], offset); + c2[1] = hc_bytealign (w3[2], w3[3], offset); + c2[0] = hc_bytealign (w3[1], w3[2], offset); + c1[3] = hc_bytealign (w3[0], w3[1], offset); + c1[2] = hc_bytealign (w2[3], w3[0], offset); + c1[1] = hc_bytealign (w2[2], w2[3], offset); + c1[0] = hc_bytealign (w2[1], w2[2], offset); + c0[3] = hc_bytealign (w2[0], w2[1], offset); + c0[2] = hc_bytealign (w1[3], w2[0], offset); + c0[1] = hc_bytealign (w1[2], w1[3], offset); + c0[0] = hc_bytealign (w1[1], w1[2], offset); + w7[3] = hc_bytealign (w1[0], w1[1], offset); + w7[2] = hc_bytealign (w0[3], w1[0], offset); + w7[1] = hc_bytealign (w0[2], w0[3], offset); + w7[0] = hc_bytealign (w0[1], w0[2], offset); + w6[3] = hc_bytealign (w0[0], w0[1], offset); + w6[2] = hc_bytealign ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + c6[3] = hc_bytealign (w7[3], 0, offset); + c6[2] = hc_bytealign (w7[2], w7[3], offset); + c6[1] = hc_bytealign (w7[1], w7[2], offset); + c6[0] = hc_bytealign (w7[0], w7[1], offset); + c5[3] = hc_bytealign (w6[3], w7[0], offset); + c5[2] = hc_bytealign (w6[2], w6[3], offset); + c5[1] = hc_bytealign (w6[1], w6[2], offset); + c5[0] = hc_bytealign (w6[0], w6[1], offset); + c4[3] = hc_bytealign (w5[3], w6[0], offset); + c4[2] = hc_bytealign (w5[2], w5[3], offset); + c4[1] = hc_bytealign (w5[1], w5[2], offset); + c4[0] = hc_bytealign (w5[0], w5[1], offset); + c3[3] = hc_bytealign (w4[3], w5[0], offset); + c3[2] = hc_bytealign (w4[2], w4[3], offset); + c3[1] = hc_bytealign (w4[1], w4[2], offset); + c3[0] = hc_bytealign (w4[0], w4[1], offset); + c2[3] = hc_bytealign (w3[3], w4[0], offset); + c2[2] = hc_bytealign (w3[2], w3[3], offset); + c2[1] = hc_bytealign (w3[1], w3[2], offset); + c2[0] = hc_bytealign (w3[0], w3[1], offset); + c1[3] = hc_bytealign (w2[3], w3[0], offset); + c1[2] = hc_bytealign (w2[2], w2[3], offset); + c1[1] = hc_bytealign (w2[1], w2[2], offset); + c1[0] = hc_bytealign (w2[0], w2[1], offset); + c0[3] = hc_bytealign (w1[3], w2[0], offset); + c0[2] = hc_bytealign (w1[2], w1[3], offset); + c0[1] = hc_bytealign (w1[1], w1[2], offset); + c0[0] = hc_bytealign (w1[0], w1[1], offset); + w7[3] = hc_bytealign (w0[3], w1[0], offset); + w7[2] = hc_bytealign (w0[2], w0[3], offset); + w7[1] = hc_bytealign (w0[1], w0[2], offset); + w7[0] = hc_bytealign (w0[0], w0[1], offset); + w6[3] = hc_bytealign ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + c7[0] = hc_bytealign (w7[3], 0, offset); + c6[3] = hc_bytealign (w7[2], w7[3], offset); + c6[2] = hc_bytealign (w7[1], w7[2], offset); + c6[1] = hc_bytealign (w7[0], w7[1], offset); + c6[0] = hc_bytealign (w6[3], w7[0], offset); + c5[3] = hc_bytealign (w6[2], w6[3], offset); + c5[2] = hc_bytealign (w6[1], w6[2], offset); + c5[1] = hc_bytealign (w6[0], w6[1], offset); + c5[0] = hc_bytealign (w5[3], w6[0], offset); + c4[3] = hc_bytealign (w5[2], w5[3], offset); + c4[2] = hc_bytealign (w5[1], w5[2], offset); + c4[1] = hc_bytealign (w5[0], w5[1], offset); + c4[0] = hc_bytealign (w4[3], w5[0], offset); + c3[3] = hc_bytealign (w4[2], w4[3], offset); + c3[2] = hc_bytealign (w4[1], w4[2], offset); + c3[1] = hc_bytealign (w4[0], w4[1], offset); + c3[0] = hc_bytealign (w3[3], w4[0], offset); + c2[3] = hc_bytealign (w3[2], w3[3], offset); + c2[2] = hc_bytealign (w3[1], w3[2], offset); + c2[1] = hc_bytealign (w3[0], w3[1], offset); + c2[0] = hc_bytealign (w2[3], w3[0], offset); + c1[3] = hc_bytealign (w2[2], w2[3], offset); + c1[2] = hc_bytealign (w2[1], w2[2], offset); + c1[1] = hc_bytealign (w2[0], w2[1], offset); + c1[0] = hc_bytealign (w1[3], w2[0], offset); + c0[3] = hc_bytealign (w1[2], w1[3], offset); + c0[2] = hc_bytealign (w1[1], w1[2], offset); + c0[1] = hc_bytealign (w1[0], w1[1], offset); + c0[0] = hc_bytealign (w0[3], w1[0], offset); + w7[3] = hc_bytealign (w0[2], w0[3], offset); + w7[2] = hc_bytealign (w0[1], w0[2], offset); + w7[1] = hc_bytealign (w0[0], w0[1], offset); + w7[0] = hc_bytealign ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + c7[1] = hc_bytealign (w7[3], 0, offset); + c7[0] = hc_bytealign (w7[2], w7[3], offset); + c6[3] = hc_bytealign (w7[1], w7[2], offset); + c6[2] = hc_bytealign (w7[0], w7[1], offset); + c6[1] = hc_bytealign (w6[3], w7[0], offset); + c6[0] = hc_bytealign (w6[2], w6[3], offset); + c5[3] = hc_bytealign (w6[1], w6[2], offset); + c5[2] = hc_bytealign (w6[0], w6[1], offset); + c5[1] = hc_bytealign (w5[3], w6[0], offset); + c5[0] = hc_bytealign (w5[2], w5[3], offset); + c4[3] = hc_bytealign (w5[1], w5[2], offset); + c4[2] = hc_bytealign (w5[0], w5[1], offset); + c4[1] = hc_bytealign (w4[3], w5[0], offset); + c4[0] = hc_bytealign (w4[2], w4[3], offset); + c3[3] = hc_bytealign (w4[1], w4[2], offset); + c3[2] = hc_bytealign (w4[0], w4[1], offset); + c3[1] = hc_bytealign (w3[3], w4[0], offset); + c3[0] = hc_bytealign (w3[2], w3[3], offset); + c2[3] = hc_bytealign (w3[1], w3[2], offset); + c2[2] = hc_bytealign (w3[0], w3[1], offset); + c2[1] = hc_bytealign (w2[3], w3[0], offset); + c2[0] = hc_bytealign (w2[2], w2[3], offset); + c1[3] = hc_bytealign (w2[1], w2[2], offset); + c1[2] = hc_bytealign (w2[0], w2[1], offset); + c1[1] = hc_bytealign (w1[3], w2[0], offset); + c1[0] = hc_bytealign (w1[2], w1[3], offset); + c0[3] = hc_bytealign (w1[1], w1[2], offset); + c0[2] = hc_bytealign (w1[0], w1[1], offset); + c0[1] = hc_bytealign (w0[3], w1[0], offset); + c0[0] = hc_bytealign (w0[2], w0[3], offset); + w7[3] = hc_bytealign (w0[1], w0[2], offset); + w7[2] = hc_bytealign (w0[0], w0[1], offset); + w7[1] = hc_bytealign ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + c7[2] = hc_bytealign (w7[3], 0, offset); + c7[1] = hc_bytealign (w7[2], w7[3], offset); + c7[0] = hc_bytealign (w7[1], w7[2], offset); + c6[3] = hc_bytealign (w7[0], w7[1], offset); + c6[2] = hc_bytealign (w6[3], w7[0], offset); + c6[1] = hc_bytealign (w6[2], w6[3], offset); + c6[0] = hc_bytealign (w6[1], w6[2], offset); + c5[3] = hc_bytealign (w6[0], w6[1], offset); + c5[2] = hc_bytealign (w5[3], w6[0], offset); + c5[1] = hc_bytealign (w5[2], w5[3], offset); + c5[0] = hc_bytealign (w5[1], w5[2], offset); + c4[3] = hc_bytealign (w5[0], w5[1], offset); + c4[2] = hc_bytealign (w4[3], w5[0], offset); + c4[1] = hc_bytealign (w4[2], w4[3], offset); + c4[0] = hc_bytealign (w4[1], w4[2], offset); + c3[3] = hc_bytealign (w4[0], w4[1], offset); + c3[2] = hc_bytealign (w3[3], w4[0], offset); + c3[1] = hc_bytealign (w3[2], w3[3], offset); + c3[0] = hc_bytealign (w3[1], w3[2], offset); + c2[3] = hc_bytealign (w3[0], w3[1], offset); + c2[2] = hc_bytealign (w2[3], w3[0], offset); + c2[1] = hc_bytealign (w2[2], w2[3], offset); + c2[0] = hc_bytealign (w2[1], w2[2], offset); + c1[3] = hc_bytealign (w2[0], w2[1], offset); + c1[2] = hc_bytealign (w1[3], w2[0], offset); + c1[1] = hc_bytealign (w1[2], w1[3], offset); + c1[0] = hc_bytealign (w1[1], w1[2], offset); + c0[3] = hc_bytealign (w1[0], w1[1], offset); + c0[2] = hc_bytealign (w0[3], w1[0], offset); + c0[1] = hc_bytealign (w0[2], w0[3], offset); + c0[0] = hc_bytealign (w0[1], w0[2], offset); + w7[3] = hc_bytealign (w0[0], w0[1], offset); + w7[2] = hc_bytealign ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + c7[3] = hc_bytealign (w7[3], 0, offset); + c7[2] = hc_bytealign (w7[2], w7[3], offset); + c7[1] = hc_bytealign (w7[1], w7[2], offset); + c7[0] = hc_bytealign (w7[0], w7[1], offset); + c6[3] = hc_bytealign (w6[3], w7[0], offset); + c6[2] = hc_bytealign (w6[2], w6[3], offset); + c6[1] = hc_bytealign (w6[1], w6[2], offset); + c6[0] = hc_bytealign (w6[0], w6[1], offset); + c5[3] = hc_bytealign (w5[3], w6[0], offset); + c5[2] = hc_bytealign (w5[2], w5[3], offset); + c5[1] = hc_bytealign (w5[1], w5[2], offset); + c5[0] = hc_bytealign (w5[0], w5[1], offset); + c4[3] = hc_bytealign (w4[3], w5[0], offset); + c4[2] = hc_bytealign (w4[2], w4[3], offset); + c4[1] = hc_bytealign (w4[1], w4[2], offset); + c4[0] = hc_bytealign (w4[0], w4[1], offset); + c3[3] = hc_bytealign (w3[3], w4[0], offset); + c3[2] = hc_bytealign (w3[2], w3[3], offset); + c3[1] = hc_bytealign (w3[1], w3[2], offset); + c3[0] = hc_bytealign (w3[0], w3[1], offset); + c2[3] = hc_bytealign (w2[3], w3[0], offset); + c2[2] = hc_bytealign (w2[2], w2[3], offset); + c2[1] = hc_bytealign (w2[1], w2[2], offset); + c2[0] = hc_bytealign (w2[0], w2[1], offset); + c1[3] = hc_bytealign (w1[3], w2[0], offset); + c1[2] = hc_bytealign (w1[2], w1[3], offset); + c1[1] = hc_bytealign (w1[1], w1[2], offset); + c1[0] = hc_bytealign (w1[0], w1[1], offset); + c0[3] = hc_bytealign (w0[3], w1[0], offset); + c0[2] = hc_bytealign (w0[2], w0[3], offset); + c0[1] = hc_bytealign (w0[1], w0[2], offset); + c0[0] = hc_bytealign (w0[0], w0[1], offset); + w7[3] = hc_bytealign ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif + + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); + #endif + + switch (offset_switch) + { + case 0: + c0[0] = hc_byte_perm (w7[3], 0, selector); + w7[3] = hc_byte_perm (w7[2], w7[3], selector); + w7[2] = hc_byte_perm (w7[1], w7[2], selector); + w7[1] = hc_byte_perm (w7[0], w7[1], selector); + w7[0] = hc_byte_perm (w6[3], w7[0], selector); + w6[3] = hc_byte_perm (w6[2], w6[3], selector); + w6[2] = hc_byte_perm (w6[1], w6[2], selector); + w6[1] = hc_byte_perm (w6[0], w6[1], selector); + w6[0] = hc_byte_perm (w5[3], w6[0], selector); + w5[3] = hc_byte_perm (w5[2], w5[3], selector); + w5[2] = hc_byte_perm (w5[1], w5[2], selector); + w5[1] = hc_byte_perm (w5[0], w5[1], selector); + w5[0] = hc_byte_perm (w4[3], w5[0], selector); + w4[3] = hc_byte_perm (w4[2], w4[3], selector); + w4[2] = hc_byte_perm (w4[1], w4[2], selector); + w4[1] = hc_byte_perm (w4[0], w4[1], selector); + w4[0] = hc_byte_perm (w3[3], w4[0], selector); + w3[3] = hc_byte_perm (w3[2], w3[3], selector); + w3[2] = hc_byte_perm (w3[1], w3[2], selector); + w3[1] = hc_byte_perm (w3[0], w3[1], selector); + w3[0] = hc_byte_perm (w2[3], w3[0], selector); + w2[3] = hc_byte_perm (w2[2], w2[3], selector); + w2[2] = hc_byte_perm (w2[1], w2[2], selector); + w2[1] = hc_byte_perm (w2[0], w2[1], selector); + w2[0] = hc_byte_perm (w1[3], w2[0], selector); + w1[3] = hc_byte_perm (w1[2], w1[3], selector); + w1[2] = hc_byte_perm (w1[1], w1[2], selector); + w1[1] = hc_byte_perm (w1[0], w1[1], selector); + w1[0] = hc_byte_perm (w0[3], w1[0], selector); + w0[3] = hc_byte_perm (w0[2], w0[3], selector); + w0[2] = hc_byte_perm (w0[1], w0[2], selector); + w0[1] = hc_byte_perm (w0[0], w0[1], selector); + w0[0] = hc_byte_perm ( 0, w0[0], selector); + + break; + + case 1: + c0[1] = hc_byte_perm (w7[3], 0, selector); + c0[0] = hc_byte_perm (w7[2], w7[3], selector); + w7[3] = hc_byte_perm (w7[1], w7[2], selector); + w7[2] = hc_byte_perm (w7[0], w7[1], selector); + w7[1] = hc_byte_perm (w6[3], w7[0], selector); + w7[0] = hc_byte_perm (w6[2], w6[3], selector); + w6[3] = hc_byte_perm (w6[1], w6[2], selector); + w6[2] = hc_byte_perm (w6[0], w6[1], selector); + w6[1] = hc_byte_perm (w5[3], w6[0], selector); + w6[0] = hc_byte_perm (w5[2], w5[3], selector); + w5[3] = hc_byte_perm (w5[1], w5[2], selector); + w5[2] = hc_byte_perm (w5[0], w5[1], selector); + w5[1] = hc_byte_perm (w4[3], w5[0], selector); + w5[0] = hc_byte_perm (w4[2], w4[3], selector); + w4[3] = hc_byte_perm (w4[1], w4[2], selector); + w4[2] = hc_byte_perm (w4[0], w4[1], selector); + w4[1] = hc_byte_perm (w3[3], w4[0], selector); + w4[0] = hc_byte_perm (w3[2], w3[3], selector); + w3[3] = hc_byte_perm (w3[1], w3[2], selector); + w3[2] = hc_byte_perm (w3[0], w3[1], selector); + w3[1] = hc_byte_perm (w2[3], w3[0], selector); + w3[0] = hc_byte_perm (w2[2], w2[3], selector); + w2[3] = hc_byte_perm (w2[1], w2[2], selector); + w2[2] = hc_byte_perm (w2[0], w2[1], selector); + w2[1] = hc_byte_perm (w1[3], w2[0], selector); + w2[0] = hc_byte_perm (w1[2], w1[3], selector); + w1[3] = hc_byte_perm (w1[1], w1[2], selector); + w1[2] = hc_byte_perm (w1[0], w1[1], selector); + w1[1] = hc_byte_perm (w0[3], w1[0], selector); + w1[0] = hc_byte_perm (w0[2], w0[3], selector); + w0[3] = hc_byte_perm (w0[1], w0[2], selector); + w0[2] = hc_byte_perm (w0[0], w0[1], selector); + w0[1] = hc_byte_perm ( 0, w0[0], selector); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_byte_perm (w7[3], 0, selector); + c0[1] = hc_byte_perm (w7[2], w7[3], selector); + c0[0] = hc_byte_perm (w7[1], w7[2], selector); + w7[3] = hc_byte_perm (w7[0], w7[1], selector); + w7[2] = hc_byte_perm (w6[3], w7[0], selector); + w7[1] = hc_byte_perm (w6[2], w6[3], selector); + w7[0] = hc_byte_perm (w6[1], w6[2], selector); + w6[3] = hc_byte_perm (w6[0], w6[1], selector); + w6[2] = hc_byte_perm (w5[3], w6[0], selector); + w6[1] = hc_byte_perm (w5[2], w5[3], selector); + w6[0] = hc_byte_perm (w5[1], w5[2], selector); + w5[3] = hc_byte_perm (w5[0], w5[1], selector); + w5[2] = hc_byte_perm (w4[3], w5[0], selector); + w5[1] = hc_byte_perm (w4[2], w4[3], selector); + w5[0] = hc_byte_perm (w4[1], w4[2], selector); + w4[3] = hc_byte_perm (w4[0], w4[1], selector); + w4[2] = hc_byte_perm (w3[3], w4[0], selector); + w4[1] = hc_byte_perm (w3[2], w3[3], selector); + w4[0] = hc_byte_perm (w3[1], w3[2], selector); + w3[3] = hc_byte_perm (w3[0], w3[1], selector); + w3[2] = hc_byte_perm (w2[3], w3[0], selector); + w3[1] = hc_byte_perm (w2[2], w2[3], selector); + w3[0] = hc_byte_perm (w2[1], w2[2], selector); + w2[3] = hc_byte_perm (w2[0], w2[1], selector); + w2[2] = hc_byte_perm (w1[3], w2[0], selector); + w2[1] = hc_byte_perm (w1[2], w1[3], selector); + w2[0] = hc_byte_perm (w1[1], w1[2], selector); + w1[3] = hc_byte_perm (w1[0], w1[1], selector); + w1[2] = hc_byte_perm (w0[3], w1[0], selector); + w1[1] = hc_byte_perm (w0[2], w0[3], selector); + w1[0] = hc_byte_perm (w0[1], w0[2], selector); + w0[3] = hc_byte_perm (w0[0], w0[1], selector); + w0[2] = hc_byte_perm ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_byte_perm (w7[3], 0, selector); + c0[2] = hc_byte_perm (w7[2], w7[3], selector); + c0[1] = hc_byte_perm (w7[1], w7[2], selector); + c0[0] = hc_byte_perm (w7[0], w7[1], selector); + w7[3] = hc_byte_perm (w6[3], w7[0], selector); + w7[2] = hc_byte_perm (w6[2], w6[3], selector); + w7[1] = hc_byte_perm (w6[1], w6[2], selector); + w7[0] = hc_byte_perm (w6[0], w6[1], selector); + w6[3] = hc_byte_perm (w5[3], w6[0], selector); + w6[2] = hc_byte_perm (w5[2], w5[3], selector); + w6[1] = hc_byte_perm (w5[1], w5[2], selector); + w6[0] = hc_byte_perm (w5[0], w5[1], selector); + w5[3] = hc_byte_perm (w4[3], w5[0], selector); + w5[2] = hc_byte_perm (w4[2], w4[3], selector); + w5[1] = hc_byte_perm (w4[1], w4[2], selector); + w5[0] = hc_byte_perm (w4[0], w4[1], selector); + w4[3] = hc_byte_perm (w3[3], w4[0], selector); + w4[2] = hc_byte_perm (w3[2], w3[3], selector); + w4[1] = hc_byte_perm (w3[1], w3[2], selector); + w4[0] = hc_byte_perm (w3[0], w3[1], selector); + w3[3] = hc_byte_perm (w2[3], w3[0], selector); + w3[2] = hc_byte_perm (w2[2], w2[3], selector); + w3[1] = hc_byte_perm (w2[1], w2[2], selector); + w3[0] = hc_byte_perm (w2[0], w2[1], selector); + w2[3] = hc_byte_perm (w1[3], w2[0], selector); + w2[2] = hc_byte_perm (w1[2], w1[3], selector); + w2[1] = hc_byte_perm (w1[1], w1[2], selector); + w2[0] = hc_byte_perm (w1[0], w1[1], selector); + w1[3] = hc_byte_perm (w0[3], w1[0], selector); + w1[2] = hc_byte_perm (w0[2], w0[3], selector); + w1[1] = hc_byte_perm (w0[1], w0[2], selector); + w1[0] = hc_byte_perm (w0[0], w0[1], selector); + w0[3] = hc_byte_perm ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_byte_perm (w7[3], 0, selector); + c0[3] = hc_byte_perm (w7[2], w7[3], selector); + c0[2] = hc_byte_perm (w7[1], w7[2], selector); + c0[1] = hc_byte_perm (w7[0], w7[1], selector); + c0[0] = hc_byte_perm (w6[3], w7[0], selector); + w7[3] = hc_byte_perm (w6[2], w6[3], selector); + w7[2] = hc_byte_perm (w6[1], w6[2], selector); + w7[1] = hc_byte_perm (w6[0], w6[1], selector); + w7[0] = hc_byte_perm (w5[3], w6[0], selector); + w6[3] = hc_byte_perm (w5[2], w5[3], selector); + w6[2] = hc_byte_perm (w5[1], w5[2], selector); + w6[1] = hc_byte_perm (w5[0], w5[1], selector); + w6[0] = hc_byte_perm (w4[3], w5[0], selector); + w5[3] = hc_byte_perm (w4[2], w4[3], selector); + w5[2] = hc_byte_perm (w4[1], w4[2], selector); + w5[1] = hc_byte_perm (w4[0], w4[1], selector); + w5[0] = hc_byte_perm (w3[3], w4[0], selector); + w4[3] = hc_byte_perm (w3[2], w3[3], selector); + w4[2] = hc_byte_perm (w3[1], w3[2], selector); + w4[1] = hc_byte_perm (w3[0], w3[1], selector); + w4[0] = hc_byte_perm (w2[3], w3[0], selector); + w3[3] = hc_byte_perm (w2[2], w2[3], selector); + w3[2] = hc_byte_perm (w2[1], w2[2], selector); + w3[1] = hc_byte_perm (w2[0], w2[1], selector); + w3[0] = hc_byte_perm (w1[3], w2[0], selector); + w2[3] = hc_byte_perm (w1[2], w1[3], selector); + w2[2] = hc_byte_perm (w1[1], w1[2], selector); + w2[1] = hc_byte_perm (w1[0], w1[1], selector); + w2[0] = hc_byte_perm (w0[3], w1[0], selector); + w1[3] = hc_byte_perm (w0[2], w0[3], selector); + w1[2] = hc_byte_perm (w0[1], w0[2], selector); + w1[1] = hc_byte_perm (w0[0], w0[1], selector); + w1[0] = hc_byte_perm ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = hc_byte_perm (w7[3], 0, selector); + c1[0] = hc_byte_perm (w7[2], w7[3], selector); + c0[3] = hc_byte_perm (w7[1], w7[2], selector); + c0[2] = hc_byte_perm (w7[0], w7[1], selector); + c0[1] = hc_byte_perm (w6[3], w7[0], selector); + c0[0] = hc_byte_perm (w6[2], w6[3], selector); + w7[3] = hc_byte_perm (w6[1], w6[2], selector); + w7[2] = hc_byte_perm (w6[0], w6[1], selector); + w7[1] = hc_byte_perm (w5[3], w6[0], selector); + w7[0] = hc_byte_perm (w5[2], w5[3], selector); + w6[3] = hc_byte_perm (w5[1], w5[2], selector); + w6[2] = hc_byte_perm (w5[0], w5[1], selector); + w6[1] = hc_byte_perm (w4[3], w5[0], selector); + w6[0] = hc_byte_perm (w4[2], w4[3], selector); + w5[3] = hc_byte_perm (w4[1], w4[2], selector); + w5[2] = hc_byte_perm (w4[0], w4[1], selector); + w5[1] = hc_byte_perm (w3[3], w4[0], selector); + w5[0] = hc_byte_perm (w3[2], w3[3], selector); + w4[3] = hc_byte_perm (w3[1], w3[2], selector); + w4[2] = hc_byte_perm (w3[0], w3[1], selector); + w4[1] = hc_byte_perm (w2[3], w3[0], selector); + w4[0] = hc_byte_perm (w2[2], w2[3], selector); + w3[3] = hc_byte_perm (w2[1], w2[2], selector); + w3[2] = hc_byte_perm (w2[0], w2[1], selector); + w3[1] = hc_byte_perm (w1[3], w2[0], selector); + w3[0] = hc_byte_perm (w1[2], w1[3], selector); + w2[3] = hc_byte_perm (w1[1], w1[2], selector); + w2[2] = hc_byte_perm (w1[0], w1[1], selector); + w2[1] = hc_byte_perm (w0[3], w1[0], selector); + w2[0] = hc_byte_perm (w0[2], w0[3], selector); + w1[3] = hc_byte_perm (w0[1], w0[2], selector); + w1[2] = hc_byte_perm (w0[0], w0[1], selector); + w1[1] = hc_byte_perm ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_byte_perm (w7[3], 0, selector); + c1[1] = hc_byte_perm (w7[2], w7[3], selector); + c1[0] = hc_byte_perm (w7[1], w7[2], selector); + c0[3] = hc_byte_perm (w7[0], w7[1], selector); + c0[2] = hc_byte_perm (w6[3], w7[0], selector); + c0[1] = hc_byte_perm (w6[2], w6[3], selector); + c0[0] = hc_byte_perm (w6[1], w6[2], selector); + w7[3] = hc_byte_perm (w6[0], w6[1], selector); + w7[2] = hc_byte_perm (w5[3], w6[0], selector); + w7[1] = hc_byte_perm (w5[2], w5[3], selector); + w7[0] = hc_byte_perm (w5[1], w5[2], selector); + w6[3] = hc_byte_perm (w5[0], w5[1], selector); + w6[2] = hc_byte_perm (w4[3], w5[0], selector); + w6[1] = hc_byte_perm (w4[2], w4[3], selector); + w6[0] = hc_byte_perm (w4[1], w4[2], selector); + w5[3] = hc_byte_perm (w4[0], w4[1], selector); + w5[2] = hc_byte_perm (w3[3], w4[0], selector); + w5[1] = hc_byte_perm (w3[2], w3[3], selector); + w5[0] = hc_byte_perm (w3[1], w3[2], selector); + w4[3] = hc_byte_perm (w3[0], w3[1], selector); + w4[2] = hc_byte_perm (w2[3], w3[0], selector); + w4[1] = hc_byte_perm (w2[2], w2[3], selector); + w4[0] = hc_byte_perm (w2[1], w2[2], selector); + w3[3] = hc_byte_perm (w2[0], w2[1], selector); + w3[2] = hc_byte_perm (w1[3], w2[0], selector); + w3[1] = hc_byte_perm (w1[2], w1[3], selector); + w3[0] = hc_byte_perm (w1[1], w1[2], selector); + w2[3] = hc_byte_perm (w1[0], w1[1], selector); + w2[2] = hc_byte_perm (w0[3], w1[0], selector); + w2[1] = hc_byte_perm (w0[2], w0[3], selector); + w2[0] = hc_byte_perm (w0[1], w0[2], selector); + w1[3] = hc_byte_perm (w0[0], w0[1], selector); + w1[2] = hc_byte_perm ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = hc_byte_perm (w7[3], 0, selector); + c1[2] = hc_byte_perm (w7[2], w7[3], selector); + c1[1] = hc_byte_perm (w7[1], w7[2], selector); + c1[0] = hc_byte_perm (w7[0], w7[1], selector); + c0[3] = hc_byte_perm (w6[3], w7[0], selector); + c0[2] = hc_byte_perm (w6[2], w6[3], selector); + c0[1] = hc_byte_perm (w6[1], w6[2], selector); + c0[0] = hc_byte_perm (w6[0], w6[1], selector); + w7[3] = hc_byte_perm (w5[3], w6[0], selector); + w7[2] = hc_byte_perm (w5[2], w5[3], selector); + w7[1] = hc_byte_perm (w5[1], w5[2], selector); + w7[0] = hc_byte_perm (w5[0], w5[1], selector); + w6[3] = hc_byte_perm (w4[3], w5[0], selector); + w6[2] = hc_byte_perm (w4[2], w4[3], selector); + w6[1] = hc_byte_perm (w4[1], w4[2], selector); + w6[0] = hc_byte_perm (w4[0], w4[1], selector); + w5[3] = hc_byte_perm (w3[3], w4[0], selector); + w5[2] = hc_byte_perm (w3[2], w3[3], selector); + w5[1] = hc_byte_perm (w3[1], w3[2], selector); + w5[0] = hc_byte_perm (w3[0], w3[1], selector); + w4[3] = hc_byte_perm (w2[3], w3[0], selector); + w4[2] = hc_byte_perm (w2[2], w2[3], selector); + w4[1] = hc_byte_perm (w2[1], w2[2], selector); + w4[0] = hc_byte_perm (w2[0], w2[1], selector); + w3[3] = hc_byte_perm (w1[3], w2[0], selector); + w3[2] = hc_byte_perm (w1[2], w1[3], selector); + w3[1] = hc_byte_perm (w1[1], w1[2], selector); + w3[0] = hc_byte_perm (w1[0], w1[1], selector); + w2[3] = hc_byte_perm (w0[3], w1[0], selector); + w2[2] = hc_byte_perm (w0[2], w0[3], selector); + w2[1] = hc_byte_perm (w0[1], w0[2], selector); + w2[0] = hc_byte_perm (w0[0], w0[1], selector); + w1[3] = hc_byte_perm ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = hc_byte_perm (w7[3], 0, selector); + c1[3] = hc_byte_perm (w7[2], w7[3], selector); + c1[2] = hc_byte_perm (w7[1], w7[2], selector); + c1[1] = hc_byte_perm (w7[0], w7[1], selector); + c1[0] = hc_byte_perm (w6[3], w7[0], selector); + c0[3] = hc_byte_perm (w6[2], w6[3], selector); + c0[2] = hc_byte_perm (w6[1], w6[2], selector); + c0[1] = hc_byte_perm (w6[0], w6[1], selector); + c0[0] = hc_byte_perm (w5[3], w6[0], selector); + w7[3] = hc_byte_perm (w5[2], w5[3], selector); + w7[2] = hc_byte_perm (w5[1], w5[2], selector); + w7[1] = hc_byte_perm (w5[0], w5[1], selector); + w7[0] = hc_byte_perm (w4[3], w5[0], selector); + w6[3] = hc_byte_perm (w4[2], w4[3], selector); + w6[2] = hc_byte_perm (w4[1], w4[2], selector); + w6[1] = hc_byte_perm (w4[0], w4[1], selector); + w6[0] = hc_byte_perm (w3[3], w4[0], selector); + w5[3] = hc_byte_perm (w3[2], w3[3], selector); + w5[2] = hc_byte_perm (w3[1], w3[2], selector); + w5[1] = hc_byte_perm (w3[0], w3[1], selector); + w5[0] = hc_byte_perm (w2[3], w3[0], selector); + w4[3] = hc_byte_perm (w2[2], w2[3], selector); + w4[2] = hc_byte_perm (w2[1], w2[2], selector); + w4[1] = hc_byte_perm (w2[0], w2[1], selector); + w4[0] = hc_byte_perm (w1[3], w2[0], selector); + w3[3] = hc_byte_perm (w1[2], w1[3], selector); + w3[2] = hc_byte_perm (w1[1], w1[2], selector); + w3[1] = hc_byte_perm (w1[0], w1[1], selector); + w3[0] = hc_byte_perm (w0[3], w1[0], selector); + w2[3] = hc_byte_perm (w0[2], w0[3], selector); + w2[2] = hc_byte_perm (w0[1], w0[2], selector); + w2[1] = hc_byte_perm (w0[0], w0[1], selector); + w2[0] = hc_byte_perm ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = hc_byte_perm (w7[3], 0, selector); + c2[0] = hc_byte_perm (w7[2], w7[3], selector); + c1[3] = hc_byte_perm (w7[1], w7[2], selector); + c1[2] = hc_byte_perm (w7[0], w7[1], selector); + c1[1] = hc_byte_perm (w6[3], w7[0], selector); + c1[0] = hc_byte_perm (w6[2], w6[3], selector); + c0[3] = hc_byte_perm (w6[1], w6[2], selector); + c0[2] = hc_byte_perm (w6[0], w6[1], selector); + c0[1] = hc_byte_perm (w5[3], w6[0], selector); + c0[0] = hc_byte_perm (w5[2], w5[3], selector); + w7[3] = hc_byte_perm (w5[1], w5[2], selector); + w7[2] = hc_byte_perm (w5[0], w5[1], selector); + w7[1] = hc_byte_perm (w4[3], w5[0], selector); + w7[0] = hc_byte_perm (w4[2], w4[3], selector); + w6[3] = hc_byte_perm (w4[1], w4[2], selector); + w6[2] = hc_byte_perm (w4[0], w4[1], selector); + w6[1] = hc_byte_perm (w3[3], w4[0], selector); + w6[0] = hc_byte_perm (w3[2], w3[3], selector); + w5[3] = hc_byte_perm (w3[1], w3[2], selector); + w5[2] = hc_byte_perm (w3[0], w3[1], selector); + w5[1] = hc_byte_perm (w2[3], w3[0], selector); + w5[0] = hc_byte_perm (w2[2], w2[3], selector); + w4[3] = hc_byte_perm (w2[1], w2[2], selector); + w4[2] = hc_byte_perm (w2[0], w2[1], selector); + w4[1] = hc_byte_perm (w1[3], w2[0], selector); + w4[0] = hc_byte_perm (w1[2], w1[3], selector); + w3[3] = hc_byte_perm (w1[1], w1[2], selector); + w3[2] = hc_byte_perm (w1[0], w1[1], selector); + w3[1] = hc_byte_perm (w0[3], w1[0], selector); + w3[0] = hc_byte_perm (w0[2], w0[3], selector); + w2[3] = hc_byte_perm (w0[1], w0[2], selector); + w2[2] = hc_byte_perm (w0[0], w0[1], selector); + w2[1] = hc_byte_perm ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = hc_byte_perm (w7[3], 0, selector); + c2[1] = hc_byte_perm (w7[2], w7[3], selector); + c2[0] = hc_byte_perm (w7[1], w7[2], selector); + c1[3] = hc_byte_perm (w7[0], w7[1], selector); + c1[2] = hc_byte_perm (w6[3], w7[0], selector); + c1[1] = hc_byte_perm (w6[2], w6[3], selector); + c1[0] = hc_byte_perm (w6[1], w6[2], selector); + c0[3] = hc_byte_perm (w6[0], w6[1], selector); + c0[2] = hc_byte_perm (w5[3], w6[0], selector); + c0[1] = hc_byte_perm (w5[2], w5[3], selector); + c0[0] = hc_byte_perm (w5[1], w5[2], selector); + w7[3] = hc_byte_perm (w5[0], w5[1], selector); + w7[2] = hc_byte_perm (w4[3], w5[0], selector); + w7[1] = hc_byte_perm (w4[2], w4[3], selector); + w7[0] = hc_byte_perm (w4[1], w4[2], selector); + w6[3] = hc_byte_perm (w4[0], w4[1], selector); + w6[2] = hc_byte_perm (w3[3], w4[0], selector); + w6[1] = hc_byte_perm (w3[2], w3[3], selector); + w6[0] = hc_byte_perm (w3[1], w3[2], selector); + w5[3] = hc_byte_perm (w3[0], w3[1], selector); + w5[2] = hc_byte_perm (w2[3], w3[0], selector); + w5[1] = hc_byte_perm (w2[2], w2[3], selector); + w5[0] = hc_byte_perm (w2[1], w2[2], selector); + w4[3] = hc_byte_perm (w2[0], w2[1], selector); + w4[2] = hc_byte_perm (w1[3], w2[0], selector); + w4[1] = hc_byte_perm (w1[2], w1[3], selector); + w4[0] = hc_byte_perm (w1[1], w1[2], selector); + w3[3] = hc_byte_perm (w1[0], w1[1], selector); + w3[2] = hc_byte_perm (w0[3], w1[0], selector); + w3[1] = hc_byte_perm (w0[2], w0[3], selector); + w3[0] = hc_byte_perm (w0[1], w0[2], selector); + w2[3] = hc_byte_perm (w0[0], w0[1], selector); + w2[2] = hc_byte_perm ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_byte_perm (w7[3], 0, selector); + c2[2] = hc_byte_perm (w7[2], w7[3], selector); + c2[1] = hc_byte_perm (w7[1], w7[2], selector); + c2[0] = hc_byte_perm (w7[0], w7[1], selector); + c1[3] = hc_byte_perm (w6[3], w7[0], selector); + c1[2] = hc_byte_perm (w6[2], w6[3], selector); + c1[1] = hc_byte_perm (w6[1], w6[2], selector); + c1[0] = hc_byte_perm (w6[0], w6[1], selector); + c0[3] = hc_byte_perm (w5[3], w6[0], selector); + c0[2] = hc_byte_perm (w5[2], w5[3], selector); + c0[1] = hc_byte_perm (w5[1], w5[2], selector); + c0[0] = hc_byte_perm (w5[0], w5[1], selector); + w7[3] = hc_byte_perm (w4[3], w5[0], selector); + w7[2] = hc_byte_perm (w4[2], w4[3], selector); + w7[1] = hc_byte_perm (w4[1], w4[2], selector); + w7[0] = hc_byte_perm (w4[0], w4[1], selector); + w6[3] = hc_byte_perm (w3[3], w4[0], selector); + w6[2] = hc_byte_perm (w3[2], w3[3], selector); + w6[1] = hc_byte_perm (w3[1], w3[2], selector); + w6[0] = hc_byte_perm (w3[0], w3[1], selector); + w5[3] = hc_byte_perm (w2[3], w3[0], selector); + w5[2] = hc_byte_perm (w2[2], w2[3], selector); + w5[1] = hc_byte_perm (w2[1], w2[2], selector); + w5[0] = hc_byte_perm (w2[0], w2[1], selector); + w4[3] = hc_byte_perm (w1[3], w2[0], selector); + w4[2] = hc_byte_perm (w1[2], w1[3], selector); + w4[1] = hc_byte_perm (w1[1], w1[2], selector); + w4[0] = hc_byte_perm (w1[0], w1[1], selector); + w3[3] = hc_byte_perm (w0[3], w1[0], selector); + w3[2] = hc_byte_perm (w0[2], w0[3], selector); + w3[1] = hc_byte_perm (w0[1], w0[2], selector); + w3[0] = hc_byte_perm (w0[0], w0[1], selector); + w2[3] = hc_byte_perm ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_byte_perm (w7[3], 0, selector); + c2[3] = hc_byte_perm (w7[2], w7[3], selector); + c2[2] = hc_byte_perm (w7[1], w7[2], selector); + c2[1] = hc_byte_perm (w7[0], w7[1], selector); + c2[0] = hc_byte_perm (w6[3], w7[0], selector); + c1[3] = hc_byte_perm (w6[2], w6[3], selector); + c1[2] = hc_byte_perm (w6[1], w6[2], selector); + c1[1] = hc_byte_perm (w6[0], w6[1], selector); + c1[0] = hc_byte_perm (w5[3], w6[0], selector); + c0[3] = hc_byte_perm (w5[2], w5[3], selector); + c0[2] = hc_byte_perm (w5[1], w5[2], selector); + c0[1] = hc_byte_perm (w5[0], w5[1], selector); + c0[0] = hc_byte_perm (w4[3], w5[0], selector); + w7[3] = hc_byte_perm (w4[2], w4[3], selector); + w7[2] = hc_byte_perm (w4[1], w4[2], selector); + w7[1] = hc_byte_perm (w4[0], w4[1], selector); + w7[0] = hc_byte_perm (w3[3], w4[0], selector); + w6[3] = hc_byte_perm (w3[2], w3[3], selector); + w6[2] = hc_byte_perm (w3[1], w3[2], selector); + w6[1] = hc_byte_perm (w3[0], w3[1], selector); + w6[0] = hc_byte_perm (w2[3], w3[0], selector); + w5[3] = hc_byte_perm (w2[2], w2[3], selector); + w5[2] = hc_byte_perm (w2[1], w2[2], selector); + w5[1] = hc_byte_perm (w2[0], w2[1], selector); + w5[0] = hc_byte_perm (w1[3], w2[0], selector); + w4[3] = hc_byte_perm (w1[2], w1[3], selector); + w4[2] = hc_byte_perm (w1[1], w1[2], selector); + w4[1] = hc_byte_perm (w1[0], w1[1], selector); + w4[0] = hc_byte_perm (w0[3], w1[0], selector); + w3[3] = hc_byte_perm (w0[2], w0[3], selector); + w3[2] = hc_byte_perm (w0[1], w0[2], selector); + w3[1] = hc_byte_perm (w0[0], w0[1], selector); + w3[0] = hc_byte_perm ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = hc_byte_perm (w7[3], 0, selector); + c3[0] = hc_byte_perm (w7[2], w7[3], selector); + c2[3] = hc_byte_perm (w7[1], w7[2], selector); + c2[2] = hc_byte_perm (w7[0], w7[1], selector); + c2[1] = hc_byte_perm (w6[3], w7[0], selector); + c2[0] = hc_byte_perm (w6[2], w6[3], selector); + c1[3] = hc_byte_perm (w6[1], w6[2], selector); + c1[2] = hc_byte_perm (w6[0], w6[1], selector); + c1[1] = hc_byte_perm (w5[3], w6[0], selector); + c1[0] = hc_byte_perm (w5[2], w5[3], selector); + c0[3] = hc_byte_perm (w5[1], w5[2], selector); + c0[2] = hc_byte_perm (w5[0], w5[1], selector); + c0[1] = hc_byte_perm (w4[3], w5[0], selector); + c0[0] = hc_byte_perm (w4[2], w4[3], selector); + w7[3] = hc_byte_perm (w4[1], w4[2], selector); + w7[2] = hc_byte_perm (w4[0], w4[1], selector); + w7[1] = hc_byte_perm (w3[3], w4[0], selector); + w7[0] = hc_byte_perm (w3[2], w3[3], selector); + w6[3] = hc_byte_perm (w3[1], w3[2], selector); + w6[2] = hc_byte_perm (w3[0], w3[1], selector); + w6[1] = hc_byte_perm (w2[3], w3[0], selector); + w6[0] = hc_byte_perm (w2[2], w2[3], selector); + w5[3] = hc_byte_perm (w2[1], w2[2], selector); + w5[2] = hc_byte_perm (w2[0], w2[1], selector); + w5[1] = hc_byte_perm (w1[3], w2[0], selector); + w5[0] = hc_byte_perm (w1[2], w1[3], selector); + w4[3] = hc_byte_perm (w1[1], w1[2], selector); + w4[2] = hc_byte_perm (w1[0], w1[1], selector); + w4[1] = hc_byte_perm (w0[3], w1[0], selector); + w4[0] = hc_byte_perm (w0[2], w0[3], selector); + w3[3] = hc_byte_perm (w0[1], w0[2], selector); + w3[2] = hc_byte_perm (w0[0], w0[1], selector); + w3[1] = hc_byte_perm ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = hc_byte_perm (w7[3], 0, selector); + c3[1] = hc_byte_perm (w7[2], w7[3], selector); + c3[0] = hc_byte_perm (w7[1], w7[2], selector); + c2[3] = hc_byte_perm (w7[0], w7[1], selector); + c2[2] = hc_byte_perm (w6[3], w7[0], selector); + c2[1] = hc_byte_perm (w6[2], w6[3], selector); + c2[0] = hc_byte_perm (w6[1], w6[2], selector); + c1[3] = hc_byte_perm (w6[0], w6[1], selector); + c1[2] = hc_byte_perm (w5[3], w6[0], selector); + c1[1] = hc_byte_perm (w5[2], w5[3], selector); + c1[0] = hc_byte_perm (w5[1], w5[2], selector); + c0[3] = hc_byte_perm (w5[0], w5[1], selector); + c0[2] = hc_byte_perm (w4[3], w5[0], selector); + c0[1] = hc_byte_perm (w4[2], w4[3], selector); + c0[0] = hc_byte_perm (w4[1], w4[2], selector); + w7[3] = hc_byte_perm (w4[0], w4[1], selector); + w7[2] = hc_byte_perm (w3[3], w4[0], selector); + w7[1] = hc_byte_perm (w3[2], w3[3], selector); + w7[0] = hc_byte_perm (w3[1], w3[2], selector); + w6[3] = hc_byte_perm (w3[0], w3[1], selector); + w6[2] = hc_byte_perm (w2[3], w3[0], selector); + w6[1] = hc_byte_perm (w2[2], w2[3], selector); + w6[0] = hc_byte_perm (w2[1], w2[2], selector); + w5[3] = hc_byte_perm (w2[0], w2[1], selector); + w5[2] = hc_byte_perm (w1[3], w2[0], selector); + w5[1] = hc_byte_perm (w1[2], w1[3], selector); + w5[0] = hc_byte_perm (w1[1], w1[2], selector); + w4[3] = hc_byte_perm (w1[0], w1[1], selector); + w4[2] = hc_byte_perm (w0[3], w1[0], selector); + w4[1] = hc_byte_perm (w0[2], w0[3], selector); + w4[0] = hc_byte_perm (w0[1], w0[2], selector); + w3[3] = hc_byte_perm (w0[0], w0[1], selector); + w3[2] = hc_byte_perm ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = hc_byte_perm (w7[3], 0, selector); + c3[2] = hc_byte_perm (w7[2], w7[3], selector); + c3[1] = hc_byte_perm (w7[1], w7[2], selector); + c3[0] = hc_byte_perm (w7[0], w7[1], selector); + c2[3] = hc_byte_perm (w6[3], w7[0], selector); + c2[2] = hc_byte_perm (w6[2], w6[3], selector); + c2[1] = hc_byte_perm (w6[1], w6[2], selector); + c2[0] = hc_byte_perm (w6[0], w6[1], selector); + c1[3] = hc_byte_perm (w5[3], w6[0], selector); + c1[2] = hc_byte_perm (w5[2], w5[3], selector); + c1[1] = hc_byte_perm (w5[1], w5[2], selector); + c1[0] = hc_byte_perm (w5[0], w5[1], selector); + c0[3] = hc_byte_perm (w4[3], w5[0], selector); + c0[2] = hc_byte_perm (w4[2], w4[3], selector); + c0[1] = hc_byte_perm (w4[1], w4[2], selector); + c0[0] = hc_byte_perm (w4[0], w4[1], selector); + w7[3] = hc_byte_perm (w3[3], w4[0], selector); + w7[2] = hc_byte_perm (w3[2], w3[3], selector); + w7[1] = hc_byte_perm (w3[1], w3[2], selector); + w7[0] = hc_byte_perm (w3[0], w3[1], selector); + w6[3] = hc_byte_perm (w2[3], w3[0], selector); + w6[2] = hc_byte_perm (w2[2], w2[3], selector); + w6[1] = hc_byte_perm (w2[1], w2[2], selector); + w6[0] = hc_byte_perm (w2[0], w2[1], selector); + w5[3] = hc_byte_perm (w1[3], w2[0], selector); + w5[2] = hc_byte_perm (w1[2], w1[3], selector); + w5[1] = hc_byte_perm (w1[1], w1[2], selector); + w5[0] = hc_byte_perm (w1[0], w1[1], selector); + w4[3] = hc_byte_perm (w0[3], w1[0], selector); + w4[2] = hc_byte_perm (w0[2], w0[3], selector); + w4[1] = hc_byte_perm (w0[1], w0[2], selector); + w4[0] = hc_byte_perm (w0[0], w0[1], selector); + w3[3] = hc_byte_perm ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + c4[0] = hc_byte_perm (w7[3], 0, selector); + c3[3] = hc_byte_perm (w7[2], w7[3], selector); + c3[2] = hc_byte_perm (w7[1], w7[2], selector); + c3[1] = hc_byte_perm (w7[0], w7[1], selector); + c3[0] = hc_byte_perm (w6[3], w7[0], selector); + c2[3] = hc_byte_perm (w6[2], w6[3], selector); + c2[2] = hc_byte_perm (w6[1], w6[2], selector); + c2[1] = hc_byte_perm (w6[0], w6[1], selector); + c2[0] = hc_byte_perm (w5[3], w6[0], selector); + c1[3] = hc_byte_perm (w5[2], w5[3], selector); + c1[2] = hc_byte_perm (w5[1], w5[2], selector); + c1[1] = hc_byte_perm (w5[0], w5[1], selector); + c1[0] = hc_byte_perm (w4[3], w5[0], selector); + c0[3] = hc_byte_perm (w4[2], w4[3], selector); + c0[2] = hc_byte_perm (w4[1], w4[2], selector); + c0[1] = hc_byte_perm (w4[0], w4[1], selector); + c0[0] = hc_byte_perm (w3[3], w4[0], selector); + w7[3] = hc_byte_perm (w3[2], w3[3], selector); + w7[2] = hc_byte_perm (w3[1], w3[2], selector); + w7[1] = hc_byte_perm (w3[0], w3[1], selector); + w7[0] = hc_byte_perm (w2[3], w3[0], selector); + w6[3] = hc_byte_perm (w2[2], w2[3], selector); + w6[2] = hc_byte_perm (w2[1], w2[2], selector); + w6[1] = hc_byte_perm (w2[0], w2[1], selector); + w6[0] = hc_byte_perm (w1[3], w2[0], selector); + w5[3] = hc_byte_perm (w1[2], w1[3], selector); + w5[2] = hc_byte_perm (w1[1], w1[2], selector); + w5[1] = hc_byte_perm (w1[0], w1[1], selector); + w5[0] = hc_byte_perm (w0[3], w1[0], selector); + w4[3] = hc_byte_perm (w0[2], w0[3], selector); + w4[2] = hc_byte_perm (w0[1], w0[2], selector); + w4[1] = hc_byte_perm (w0[0], w0[1], selector); + w4[0] = hc_byte_perm ( 0, w0[0], selector); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + c4[1] = hc_byte_perm (w7[3], 0, selector); + c4[0] = hc_byte_perm (w7[2], w7[3], selector); + c3[3] = hc_byte_perm (w7[1], w7[2], selector); + c3[2] = hc_byte_perm (w7[0], w7[1], selector); + c3[1] = hc_byte_perm (w6[3], w7[0], selector); + c3[0] = hc_byte_perm (w6[2], w6[3], selector); + c2[3] = hc_byte_perm (w6[1], w6[2], selector); + c2[2] = hc_byte_perm (w6[0], w6[1], selector); + c2[1] = hc_byte_perm (w5[3], w6[0], selector); + c2[0] = hc_byte_perm (w5[2], w5[3], selector); + c1[3] = hc_byte_perm (w5[1], w5[2], selector); + c1[2] = hc_byte_perm (w5[0], w5[1], selector); + c1[1] = hc_byte_perm (w4[3], w5[0], selector); + c1[0] = hc_byte_perm (w4[2], w4[3], selector); + c0[3] = hc_byte_perm (w4[1], w4[2], selector); + c0[2] = hc_byte_perm (w4[0], w4[1], selector); + c0[1] = hc_byte_perm (w3[3], w4[0], selector); + c0[0] = hc_byte_perm (w3[2], w3[3], selector); + w7[3] = hc_byte_perm (w3[1], w3[2], selector); + w7[2] = hc_byte_perm (w3[0], w3[1], selector); + w7[1] = hc_byte_perm (w2[3], w3[0], selector); + w7[0] = hc_byte_perm (w2[2], w2[3], selector); + w6[3] = hc_byte_perm (w2[1], w2[2], selector); + w6[2] = hc_byte_perm (w2[0], w2[1], selector); + w6[1] = hc_byte_perm (w1[3], w2[0], selector); + w6[0] = hc_byte_perm (w1[2], w1[3], selector); + w5[3] = hc_byte_perm (w1[1], w1[2], selector); + w5[2] = hc_byte_perm (w1[0], w1[1], selector); + w5[1] = hc_byte_perm (w0[3], w1[0], selector); + w5[0] = hc_byte_perm (w0[2], w0[3], selector); + w4[3] = hc_byte_perm (w0[1], w0[2], selector); + w4[2] = hc_byte_perm (w0[0], w0[1], selector); + w4[1] = hc_byte_perm ( 0, w0[0], selector); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + c4[2] = hc_byte_perm (w7[3], 0, selector); + c4[1] = hc_byte_perm (w7[2], w7[3], selector); + c4[0] = hc_byte_perm (w7[1], w7[2], selector); + c3[3] = hc_byte_perm (w7[0], w7[1], selector); + c3[2] = hc_byte_perm (w6[3], w7[0], selector); + c3[1] = hc_byte_perm (w6[2], w6[3], selector); + c3[0] = hc_byte_perm (w6[1], w6[2], selector); + c2[3] = hc_byte_perm (w6[0], w6[1], selector); + c2[2] = hc_byte_perm (w5[3], w6[0], selector); + c2[1] = hc_byte_perm (w5[2], w5[3], selector); + c2[0] = hc_byte_perm (w5[1], w5[2], selector); + c1[3] = hc_byte_perm (w5[0], w5[1], selector); + c1[2] = hc_byte_perm (w4[3], w5[0], selector); + c1[1] = hc_byte_perm (w4[2], w4[3], selector); + c1[0] = hc_byte_perm (w4[1], w4[2], selector); + c0[3] = hc_byte_perm (w4[0], w4[1], selector); + c0[2] = hc_byte_perm (w3[3], w4[0], selector); + c0[1] = hc_byte_perm (w3[2], w3[3], selector); + c0[0] = hc_byte_perm (w3[1], w3[2], selector); + w7[3] = hc_byte_perm (w3[0], w3[1], selector); + w7[2] = hc_byte_perm (w2[3], w3[0], selector); + w7[1] = hc_byte_perm (w2[2], w2[3], selector); + w7[0] = hc_byte_perm (w2[1], w2[2], selector); + w6[3] = hc_byte_perm (w2[0], w2[1], selector); + w6[2] = hc_byte_perm (w1[3], w2[0], selector); + w6[1] = hc_byte_perm (w1[2], w1[3], selector); + w6[0] = hc_byte_perm (w1[1], w1[2], selector); + w5[3] = hc_byte_perm (w1[0], w1[1], selector); + w5[2] = hc_byte_perm (w0[3], w1[0], selector); + w5[1] = hc_byte_perm (w0[2], w0[3], selector); + w5[0] = hc_byte_perm (w0[1], w0[2], selector); + w4[3] = hc_byte_perm (w0[0], w0[1], selector); + w4[2] = hc_byte_perm ( 0, w0[0], selector); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_byte_perm (w7[3], 0, selector); + c4[2] = hc_byte_perm (w7[2], w7[3], selector); + c4[1] = hc_byte_perm (w7[1], w7[2], selector); + c4[0] = hc_byte_perm (w7[0], w7[1], selector); + c3[3] = hc_byte_perm (w6[3], w7[0], selector); + c3[2] = hc_byte_perm (w6[2], w6[3], selector); + c3[1] = hc_byte_perm (w6[1], w6[2], selector); + c3[0] = hc_byte_perm (w6[0], w6[1], selector); + c2[3] = hc_byte_perm (w5[3], w6[0], selector); + c2[2] = hc_byte_perm (w5[2], w5[3], selector); + c2[1] = hc_byte_perm (w5[1], w5[2], selector); + c2[0] = hc_byte_perm (w5[0], w5[1], selector); + c1[3] = hc_byte_perm (w4[3], w5[0], selector); + c1[2] = hc_byte_perm (w4[2], w4[3], selector); + c1[1] = hc_byte_perm (w4[1], w4[2], selector); + c1[0] = hc_byte_perm (w4[0], w4[1], selector); + c0[3] = hc_byte_perm (w3[3], w4[0], selector); + c0[2] = hc_byte_perm (w3[2], w3[3], selector); + c0[1] = hc_byte_perm (w3[1], w3[2], selector); + c0[0] = hc_byte_perm (w3[0], w3[1], selector); + w7[3] = hc_byte_perm (w2[3], w3[0], selector); + w7[2] = hc_byte_perm (w2[2], w2[3], selector); + w7[1] = hc_byte_perm (w2[1], w2[2], selector); + w7[0] = hc_byte_perm (w2[0], w2[1], selector); + w6[3] = hc_byte_perm (w1[3], w2[0], selector); + w6[2] = hc_byte_perm (w1[2], w1[3], selector); + w6[1] = hc_byte_perm (w1[1], w1[2], selector); + w6[0] = hc_byte_perm (w1[0], w1[1], selector); + w5[3] = hc_byte_perm (w0[3], w1[0], selector); + w5[2] = hc_byte_perm (w0[2], w0[3], selector); + w5[1] = hc_byte_perm (w0[1], w0[2], selector); + w5[0] = hc_byte_perm (w0[0], w0[1], selector); + w4[3] = hc_byte_perm ( 0, w0[0], selector); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + c5[0] = hc_byte_perm (w7[3], 0, selector); + c4[3] = hc_byte_perm (w7[2], w7[3], selector); + c4[2] = hc_byte_perm (w7[1], w7[2], selector); + c4[1] = hc_byte_perm (w7[0], w7[1], selector); + c4[0] = hc_byte_perm (w6[3], w7[0], selector); + c3[3] = hc_byte_perm (w6[2], w6[3], selector); + c3[2] = hc_byte_perm (w6[1], w6[2], selector); + c3[1] = hc_byte_perm (w6[0], w6[1], selector); + c3[0] = hc_byte_perm (w5[3], w6[0], selector); + c2[3] = hc_byte_perm (w5[2], w5[3], selector); + c2[2] = hc_byte_perm (w5[1], w5[2], selector); + c2[1] = hc_byte_perm (w5[0], w5[1], selector); + c2[0] = hc_byte_perm (w4[3], w5[0], selector); + c1[3] = hc_byte_perm (w4[2], w4[3], selector); + c1[2] = hc_byte_perm (w4[1], w4[2], selector); + c1[1] = hc_byte_perm (w4[0], w4[1], selector); + c1[0] = hc_byte_perm (w3[3], w4[0], selector); + c0[3] = hc_byte_perm (w3[2], w3[3], selector); + c0[2] = hc_byte_perm (w3[1], w3[2], selector); + c0[1] = hc_byte_perm (w3[0], w3[1], selector); + c0[0] = hc_byte_perm (w2[3], w3[0], selector); + w7[3] = hc_byte_perm (w2[2], w2[3], selector); + w7[2] = hc_byte_perm (w2[1], w2[2], selector); + w7[1] = hc_byte_perm (w2[0], w2[1], selector); + w7[0] = hc_byte_perm (w1[3], w2[0], selector); + w6[3] = hc_byte_perm (w1[2], w1[3], selector); + w6[2] = hc_byte_perm (w1[1], w1[2], selector); + w6[1] = hc_byte_perm (w1[0], w1[1], selector); + w6[0] = hc_byte_perm (w0[3], w1[0], selector); + w5[3] = hc_byte_perm (w0[2], w0[3], selector); + w5[2] = hc_byte_perm (w0[1], w0[2], selector); + w5[1] = hc_byte_perm (w0[0], w0[1], selector); + w5[0] = hc_byte_perm ( 0, w0[0], selector); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + c5[1] = hc_byte_perm (w7[3], 0, selector); + c5[0] = hc_byte_perm (w7[2], w7[3], selector); + c4[3] = hc_byte_perm (w7[1], w7[2], selector); + c4[2] = hc_byte_perm (w7[0], w7[1], selector); + c4[1] = hc_byte_perm (w6[3], w7[0], selector); + c4[0] = hc_byte_perm (w6[2], w6[3], selector); + c3[3] = hc_byte_perm (w6[1], w6[2], selector); + c3[2] = hc_byte_perm (w6[0], w6[1], selector); + c3[1] = hc_byte_perm (w5[3], w6[0], selector); + c3[0] = hc_byte_perm (w5[2], w5[3], selector); + c2[3] = hc_byte_perm (w5[1], w5[2], selector); + c2[2] = hc_byte_perm (w5[0], w5[1], selector); + c2[1] = hc_byte_perm (w4[3], w5[0], selector); + c2[0] = hc_byte_perm (w4[2], w4[3], selector); + c1[3] = hc_byte_perm (w4[1], w4[2], selector); + c1[2] = hc_byte_perm (w4[0], w4[1], selector); + c1[1] = hc_byte_perm (w3[3], w4[0], selector); + c1[0] = hc_byte_perm (w3[2], w3[3], selector); + c0[3] = hc_byte_perm (w3[1], w3[2], selector); + c0[2] = hc_byte_perm (w3[0], w3[1], selector); + c0[1] = hc_byte_perm (w2[3], w3[0], selector); + c0[0] = hc_byte_perm (w2[2], w2[3], selector); + w7[3] = hc_byte_perm (w2[1], w2[2], selector); + w7[2] = hc_byte_perm (w2[0], w2[1], selector); + w7[1] = hc_byte_perm (w1[3], w2[0], selector); + w7[0] = hc_byte_perm (w1[2], w1[3], selector); + w6[3] = hc_byte_perm (w1[1], w1[2], selector); + w6[2] = hc_byte_perm (w1[0], w1[1], selector); + w6[1] = hc_byte_perm (w0[3], w1[0], selector); + w6[0] = hc_byte_perm (w0[2], w0[3], selector); + w5[3] = hc_byte_perm (w0[1], w0[2], selector); + w5[2] = hc_byte_perm (w0[0], w0[1], selector); + w5[1] = hc_byte_perm ( 0, w0[0], selector); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + c5[2] = hc_byte_perm (w7[3], 0, selector); + c5[1] = hc_byte_perm (w7[2], w7[3], selector); + c5[0] = hc_byte_perm (w7[1], w7[2], selector); + c4[3] = hc_byte_perm (w7[0], w7[1], selector); + c4[2] = hc_byte_perm (w6[3], w7[0], selector); + c4[1] = hc_byte_perm (w6[2], w6[3], selector); + c4[0] = hc_byte_perm (w6[1], w6[2], selector); + c3[3] = hc_byte_perm (w6[0], w6[1], selector); + c3[2] = hc_byte_perm (w5[3], w6[0], selector); + c3[1] = hc_byte_perm (w5[2], w5[3], selector); + c3[0] = hc_byte_perm (w5[1], w5[2], selector); + c2[3] = hc_byte_perm (w5[0], w5[1], selector); + c2[2] = hc_byte_perm (w4[3], w5[0], selector); + c2[1] = hc_byte_perm (w4[2], w4[3], selector); + c2[0] = hc_byte_perm (w4[1], w4[2], selector); + c1[3] = hc_byte_perm (w4[0], w4[1], selector); + c1[2] = hc_byte_perm (w3[3], w4[0], selector); + c1[1] = hc_byte_perm (w3[2], w3[3], selector); + c1[0] = hc_byte_perm (w3[1], w3[2], selector); + c0[3] = hc_byte_perm (w3[0], w3[1], selector); + c0[2] = hc_byte_perm (w2[3], w3[0], selector); + c0[1] = hc_byte_perm (w2[2], w2[3], selector); + c0[0] = hc_byte_perm (w2[1], w2[2], selector); + w7[3] = hc_byte_perm (w2[0], w2[1], selector); + w7[2] = hc_byte_perm (w1[3], w2[0], selector); + w7[1] = hc_byte_perm (w1[2], w1[3], selector); + w7[0] = hc_byte_perm (w1[1], w1[2], selector); + w6[3] = hc_byte_perm (w1[0], w1[1], selector); + w6[2] = hc_byte_perm (w0[3], w1[0], selector); + w6[1] = hc_byte_perm (w0[2], w0[3], selector); + w6[0] = hc_byte_perm (w0[1], w0[2], selector); + w5[3] = hc_byte_perm (w0[0], w0[1], selector); + w5[2] = hc_byte_perm ( 0, w0[0], selector); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + c5[3] = hc_byte_perm (w7[3], 0, selector); + c5[2] = hc_byte_perm (w7[2], w7[3], selector); + c5[1] = hc_byte_perm (w7[1], w7[2], selector); + c5[0] = hc_byte_perm (w7[0], w7[1], selector); + c4[3] = hc_byte_perm (w6[3], w7[0], selector); + c4[2] = hc_byte_perm (w6[2], w6[3], selector); + c4[1] = hc_byte_perm (w6[1], w6[2], selector); + c4[0] = hc_byte_perm (w6[0], w6[1], selector); + c3[3] = hc_byte_perm (w5[3], w6[0], selector); + c3[2] = hc_byte_perm (w5[2], w5[3], selector); + c3[1] = hc_byte_perm (w5[1], w5[2], selector); + c3[0] = hc_byte_perm (w5[0], w5[1], selector); + c2[3] = hc_byte_perm (w4[3], w5[0], selector); + c2[2] = hc_byte_perm (w4[2], w4[3], selector); + c2[1] = hc_byte_perm (w4[1], w4[2], selector); + c2[0] = hc_byte_perm (w4[0], w4[1], selector); + c1[3] = hc_byte_perm (w3[3], w4[0], selector); + c1[2] = hc_byte_perm (w3[2], w3[3], selector); + c1[1] = hc_byte_perm (w3[1], w3[2], selector); + c1[0] = hc_byte_perm (w3[0], w3[1], selector); + c0[3] = hc_byte_perm (w2[3], w3[0], selector); + c0[2] = hc_byte_perm (w2[2], w2[3], selector); + c0[1] = hc_byte_perm (w2[1], w2[2], selector); + c0[0] = hc_byte_perm (w2[0], w2[1], selector); + w7[3] = hc_byte_perm (w1[3], w2[0], selector); + w7[2] = hc_byte_perm (w1[2], w1[3], selector); + w7[1] = hc_byte_perm (w1[1], w1[2], selector); + w7[0] = hc_byte_perm (w1[0], w1[1], selector); + w6[3] = hc_byte_perm (w0[3], w1[0], selector); + w6[2] = hc_byte_perm (w0[2], w0[3], selector); + w6[1] = hc_byte_perm (w0[1], w0[2], selector); + w6[0] = hc_byte_perm (w0[0], w0[1], selector); + w5[3] = hc_byte_perm ( 0, w0[0], selector); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_byte_perm (w7[3], 0, selector); + c5[3] = hc_byte_perm (w7[2], w7[3], selector); + c5[2] = hc_byte_perm (w7[1], w7[2], selector); + c5[1] = hc_byte_perm (w7[0], w7[1], selector); + c5[0] = hc_byte_perm (w6[3], w7[0], selector); + c4[3] = hc_byte_perm (w6[2], w6[3], selector); + c4[2] = hc_byte_perm (w6[1], w6[2], selector); + c4[1] = hc_byte_perm (w6[0], w6[1], selector); + c4[0] = hc_byte_perm (w5[3], w6[0], selector); + c3[3] = hc_byte_perm (w5[2], w5[3], selector); + c3[2] = hc_byte_perm (w5[1], w5[2], selector); + c3[1] = hc_byte_perm (w5[0], w5[1], selector); + c3[0] = hc_byte_perm (w4[3], w5[0], selector); + c2[3] = hc_byte_perm (w4[2], w4[3], selector); + c2[2] = hc_byte_perm (w4[1], w4[2], selector); + c2[1] = hc_byte_perm (w4[0], w4[1], selector); + c2[0] = hc_byte_perm (w3[3], w4[0], selector); + c1[3] = hc_byte_perm (w3[2], w3[3], selector); + c1[2] = hc_byte_perm (w3[1], w3[2], selector); + c1[1] = hc_byte_perm (w3[0], w3[1], selector); + c1[0] = hc_byte_perm (w2[3], w3[0], selector); + c0[3] = hc_byte_perm (w2[2], w2[3], selector); + c0[2] = hc_byte_perm (w2[1], w2[2], selector); + c0[1] = hc_byte_perm (w2[0], w2[1], selector); + c0[0] = hc_byte_perm (w1[3], w2[0], selector); + w7[3] = hc_byte_perm (w1[2], w1[3], selector); + w7[2] = hc_byte_perm (w1[1], w1[2], selector); + w7[1] = hc_byte_perm (w1[0], w1[1], selector); + w7[0] = hc_byte_perm (w0[3], w1[0], selector); + w6[3] = hc_byte_perm (w0[2], w0[3], selector); + w6[2] = hc_byte_perm (w0[1], w0[2], selector); + w6[1] = hc_byte_perm (w0[0], w0[1], selector); + w6[0] = hc_byte_perm ( 0, w0[0], selector); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_byte_perm (w7[3], 0, selector); + c6[0] = hc_byte_perm (w7[2], w7[3], selector); + c5[3] = hc_byte_perm (w7[1], w7[2], selector); + c5[2] = hc_byte_perm (w7[0], w7[1], selector); + c5[1] = hc_byte_perm (w6[3], w7[0], selector); + c5[0] = hc_byte_perm (w6[2], w6[3], selector); + c4[3] = hc_byte_perm (w6[1], w6[2], selector); + c4[2] = hc_byte_perm (w6[0], w6[1], selector); + c4[1] = hc_byte_perm (w5[3], w6[0], selector); + c4[0] = hc_byte_perm (w5[2], w5[3], selector); + c3[3] = hc_byte_perm (w5[1], w5[2], selector); + c3[2] = hc_byte_perm (w5[0], w5[1], selector); + c3[1] = hc_byte_perm (w4[3], w5[0], selector); + c3[0] = hc_byte_perm (w4[2], w4[3], selector); + c2[3] = hc_byte_perm (w4[1], w4[2], selector); + c2[2] = hc_byte_perm (w4[0], w4[1], selector); + c2[1] = hc_byte_perm (w3[3], w4[0], selector); + c2[0] = hc_byte_perm (w3[2], w3[3], selector); + c1[3] = hc_byte_perm (w3[1], w3[2], selector); + c1[2] = hc_byte_perm (w3[0], w3[1], selector); + c1[1] = hc_byte_perm (w2[3], w3[0], selector); + c1[0] = hc_byte_perm (w2[2], w2[3], selector); + c0[3] = hc_byte_perm (w2[1], w2[2], selector); + c0[2] = hc_byte_perm (w2[0], w2[1], selector); + c0[1] = hc_byte_perm (w1[3], w2[0], selector); + c0[0] = hc_byte_perm (w1[2], w1[3], selector); + w7[3] = hc_byte_perm (w1[1], w1[2], selector); + w7[2] = hc_byte_perm (w1[0], w1[1], selector); + w7[1] = hc_byte_perm (w0[3], w1[0], selector); + w7[0] = hc_byte_perm (w0[2], w0[3], selector); + w6[3] = hc_byte_perm (w0[1], w0[2], selector); + w6[2] = hc_byte_perm (w0[0], w0[1], selector); + w6[1] = hc_byte_perm ( 0, w0[0], selector); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + c6[2] = hc_byte_perm (w7[3], 0, selector); + c6[1] = hc_byte_perm (w7[2], w7[3], selector); + c6[0] = hc_byte_perm (w7[1], w7[2], selector); + c5[3] = hc_byte_perm (w7[0], w7[1], selector); + c5[2] = hc_byte_perm (w6[3], w7[0], selector); + c5[1] = hc_byte_perm (w6[2], w6[3], selector); + c5[0] = hc_byte_perm (w6[1], w6[2], selector); + c4[3] = hc_byte_perm (w6[0], w6[1], selector); + c4[2] = hc_byte_perm (w5[3], w6[0], selector); + c4[1] = hc_byte_perm (w5[2], w5[3], selector); + c4[0] = hc_byte_perm (w5[1], w5[2], selector); + c3[3] = hc_byte_perm (w5[0], w5[1], selector); + c3[2] = hc_byte_perm (w4[3], w5[0], selector); + c3[1] = hc_byte_perm (w4[2], w4[3], selector); + c3[0] = hc_byte_perm (w4[1], w4[2], selector); + c2[3] = hc_byte_perm (w4[0], w4[1], selector); + c2[2] = hc_byte_perm (w3[3], w4[0], selector); + c2[1] = hc_byte_perm (w3[2], w3[3], selector); + c2[0] = hc_byte_perm (w3[1], w3[2], selector); + c1[3] = hc_byte_perm (w3[0], w3[1], selector); + c1[2] = hc_byte_perm (w2[3], w3[0], selector); + c1[1] = hc_byte_perm (w2[2], w2[3], selector); + c1[0] = hc_byte_perm (w2[1], w2[2], selector); + c0[3] = hc_byte_perm (w2[0], w2[1], selector); + c0[2] = hc_byte_perm (w1[3], w2[0], selector); + c0[1] = hc_byte_perm (w1[2], w1[3], selector); + c0[0] = hc_byte_perm (w1[1], w1[2], selector); + w7[3] = hc_byte_perm (w1[0], w1[1], selector); + w7[2] = hc_byte_perm (w0[3], w1[0], selector); + w7[1] = hc_byte_perm (w0[2], w0[3], selector); + w7[0] = hc_byte_perm (w0[1], w0[2], selector); + w6[3] = hc_byte_perm (w0[0], w0[1], selector); + w6[2] = hc_byte_perm ( 0, w0[0], selector); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + c6[3] = hc_byte_perm (w7[3], 0, selector); + c6[2] = hc_byte_perm (w7[2], w7[3], selector); + c6[1] = hc_byte_perm (w7[1], w7[2], selector); + c6[0] = hc_byte_perm (w7[0], w7[1], selector); + c5[3] = hc_byte_perm (w6[3], w7[0], selector); + c5[2] = hc_byte_perm (w6[2], w6[3], selector); + c5[1] = hc_byte_perm (w6[1], w6[2], selector); + c5[0] = hc_byte_perm (w6[0], w6[1], selector); + c4[3] = hc_byte_perm (w5[3], w6[0], selector); + c4[2] = hc_byte_perm (w5[2], w5[3], selector); + c4[1] = hc_byte_perm (w5[1], w5[2], selector); + c4[0] = hc_byte_perm (w5[0], w5[1], selector); + c3[3] = hc_byte_perm (w4[3], w5[0], selector); + c3[2] = hc_byte_perm (w4[2], w4[3], selector); + c3[1] = hc_byte_perm (w4[1], w4[2], selector); + c3[0] = hc_byte_perm (w4[0], w4[1], selector); + c2[3] = hc_byte_perm (w3[3], w4[0], selector); + c2[2] = hc_byte_perm (w3[2], w3[3], selector); + c2[1] = hc_byte_perm (w3[1], w3[2], selector); + c2[0] = hc_byte_perm (w3[0], w3[1], selector); + c1[3] = hc_byte_perm (w2[3], w3[0], selector); + c1[2] = hc_byte_perm (w2[2], w2[3], selector); + c1[1] = hc_byte_perm (w2[1], w2[2], selector); + c1[0] = hc_byte_perm (w2[0], w2[1], selector); + c0[3] = hc_byte_perm (w1[3], w2[0], selector); + c0[2] = hc_byte_perm (w1[2], w1[3], selector); + c0[1] = hc_byte_perm (w1[1], w1[2], selector); + c0[0] = hc_byte_perm (w1[0], w1[1], selector); + w7[3] = hc_byte_perm (w0[3], w1[0], selector); + w7[2] = hc_byte_perm (w0[2], w0[3], selector); + w7[1] = hc_byte_perm (w0[1], w0[2], selector); + w7[0] = hc_byte_perm (w0[0], w0[1], selector); + w6[3] = hc_byte_perm ( 0, w0[0], selector); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + c7[0] = hc_byte_perm (w7[3], 0, selector); + c6[3] = hc_byte_perm (w7[2], w7[3], selector); + c6[2] = hc_byte_perm (w7[1], w7[2], selector); + c6[1] = hc_byte_perm (w7[0], w7[1], selector); + c6[0] = hc_byte_perm (w6[3], w7[0], selector); + c5[3] = hc_byte_perm (w6[2], w6[3], selector); + c5[2] = hc_byte_perm (w6[1], w6[2], selector); + c5[1] = hc_byte_perm (w6[0], w6[1], selector); + c5[0] = hc_byte_perm (w5[3], w6[0], selector); + c4[3] = hc_byte_perm (w5[2], w5[3], selector); + c4[2] = hc_byte_perm (w5[1], w5[2], selector); + c4[1] = hc_byte_perm (w5[0], w5[1], selector); + c4[0] = hc_byte_perm (w4[3], w5[0], selector); + c3[3] = hc_byte_perm (w4[2], w4[3], selector); + c3[2] = hc_byte_perm (w4[1], w4[2], selector); + c3[1] = hc_byte_perm (w4[0], w4[1], selector); + c3[0] = hc_byte_perm (w3[3], w4[0], selector); + c2[3] = hc_byte_perm (w3[2], w3[3], selector); + c2[2] = hc_byte_perm (w3[1], w3[2], selector); + c2[1] = hc_byte_perm (w3[0], w3[1], selector); + c2[0] = hc_byte_perm (w2[3], w3[0], selector); + c1[3] = hc_byte_perm (w2[2], w2[3], selector); + c1[2] = hc_byte_perm (w2[1], w2[2], selector); + c1[1] = hc_byte_perm (w2[0], w2[1], selector); + c1[0] = hc_byte_perm (w1[3], w2[0], selector); + c0[3] = hc_byte_perm (w1[2], w1[3], selector); + c0[2] = hc_byte_perm (w1[1], w1[2], selector); + c0[1] = hc_byte_perm (w1[0], w1[1], selector); + c0[0] = hc_byte_perm (w0[3], w1[0], selector); + w7[3] = hc_byte_perm (w0[2], w0[3], selector); + w7[2] = hc_byte_perm (w0[1], w0[2], selector); + w7[1] = hc_byte_perm (w0[0], w0[1], selector); + w7[0] = hc_byte_perm ( 0, w0[0], selector); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + c7[1] = hc_byte_perm (w7[3], 0, selector); + c7[0] = hc_byte_perm (w7[2], w7[3], selector); + c6[3] = hc_byte_perm (w7[1], w7[2], selector); + c6[2] = hc_byte_perm (w7[0], w7[1], selector); + c6[1] = hc_byte_perm (w6[3], w7[0], selector); + c6[0] = hc_byte_perm (w6[2], w6[3], selector); + c5[3] = hc_byte_perm (w6[1], w6[2], selector); + c5[2] = hc_byte_perm (w6[0], w6[1], selector); + c5[1] = hc_byte_perm (w5[3], w6[0], selector); + c5[0] = hc_byte_perm (w5[2], w5[3], selector); + c4[3] = hc_byte_perm (w5[1], w5[2], selector); + c4[2] = hc_byte_perm (w5[0], w5[1], selector); + c4[1] = hc_byte_perm (w4[3], w5[0], selector); + c4[0] = hc_byte_perm (w4[2], w4[3], selector); + c3[3] = hc_byte_perm (w4[1], w4[2], selector); + c3[2] = hc_byte_perm (w4[0], w4[1], selector); + c3[1] = hc_byte_perm (w3[3], w4[0], selector); + c3[0] = hc_byte_perm (w3[2], w3[3], selector); + c2[3] = hc_byte_perm (w3[1], w3[2], selector); + c2[2] = hc_byte_perm (w3[0], w3[1], selector); + c2[1] = hc_byte_perm (w2[3], w3[0], selector); + c2[0] = hc_byte_perm (w2[2], w2[3], selector); + c1[3] = hc_byte_perm (w2[1], w2[2], selector); + c1[2] = hc_byte_perm (w2[0], w2[1], selector); + c1[1] = hc_byte_perm (w1[3], w2[0], selector); + c1[0] = hc_byte_perm (w1[2], w1[3], selector); + c0[3] = hc_byte_perm (w1[1], w1[2], selector); + c0[2] = hc_byte_perm (w1[0], w1[1], selector); + c0[1] = hc_byte_perm (w0[3], w1[0], selector); + c0[0] = hc_byte_perm (w0[2], w0[3], selector); + w7[3] = hc_byte_perm (w0[1], w0[2], selector); + w7[2] = hc_byte_perm (w0[0], w0[1], selector); + w7[1] = hc_byte_perm ( 0, w0[0], selector); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + c7[2] = hc_byte_perm (w7[3], 0, selector); + c7[1] = hc_byte_perm (w7[2], w7[3], selector); + c7[0] = hc_byte_perm (w7[1], w7[2], selector); + c6[3] = hc_byte_perm (w7[0], w7[1], selector); + c6[2] = hc_byte_perm (w6[3], w7[0], selector); + c6[1] = hc_byte_perm (w6[2], w6[3], selector); + c6[0] = hc_byte_perm (w6[1], w6[2], selector); + c5[3] = hc_byte_perm (w6[0], w6[1], selector); + c5[2] = hc_byte_perm (w5[3], w6[0], selector); + c5[1] = hc_byte_perm (w5[2], w5[3], selector); + c5[0] = hc_byte_perm (w5[1], w5[2], selector); + c4[3] = hc_byte_perm (w5[0], w5[1], selector); + c4[2] = hc_byte_perm (w4[3], w5[0], selector); + c4[1] = hc_byte_perm (w4[2], w4[3], selector); + c4[0] = hc_byte_perm (w4[1], w4[2], selector); + c3[3] = hc_byte_perm (w4[0], w4[1], selector); + c3[2] = hc_byte_perm (w3[3], w4[0], selector); + c3[1] = hc_byte_perm (w3[2], w3[3], selector); + c3[0] = hc_byte_perm (w3[1], w3[2], selector); + c2[3] = hc_byte_perm (w3[0], w3[1], selector); + c2[2] = hc_byte_perm (w2[3], w3[0], selector); + c2[1] = hc_byte_perm (w2[2], w2[3], selector); + c2[0] = hc_byte_perm (w2[1], w2[2], selector); + c1[3] = hc_byte_perm (w2[0], w2[1], selector); + c1[2] = hc_byte_perm (w1[3], w2[0], selector); + c1[1] = hc_byte_perm (w1[2], w1[3], selector); + c1[0] = hc_byte_perm (w1[1], w1[2], selector); + c0[3] = hc_byte_perm (w1[0], w1[1], selector); + c0[2] = hc_byte_perm (w0[3], w1[0], selector); + c0[1] = hc_byte_perm (w0[2], w0[3], selector); + c0[0] = hc_byte_perm (w0[1], w0[2], selector); + w7[3] = hc_byte_perm (w0[0], w0[1], selector); + w7[2] = hc_byte_perm ( 0, w0[0], selector); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + c7[3] = hc_byte_perm (w7[3], 0, selector); + c7[2] = hc_byte_perm (w7[2], w7[3], selector); + c7[1] = hc_byte_perm (w7[1], w7[2], selector); + c7[0] = hc_byte_perm (w7[0], w7[1], selector); + c6[3] = hc_byte_perm (w6[3], w7[0], selector); + c6[2] = hc_byte_perm (w6[2], w6[3], selector); + c6[1] = hc_byte_perm (w6[1], w6[2], selector); + c6[0] = hc_byte_perm (w6[0], w6[1], selector); + c5[3] = hc_byte_perm (w5[3], w6[0], selector); + c5[2] = hc_byte_perm (w5[2], w5[3], selector); + c5[1] = hc_byte_perm (w5[1], w5[2], selector); + c5[0] = hc_byte_perm (w5[0], w5[1], selector); + c4[3] = hc_byte_perm (w4[3], w5[0], selector); + c4[2] = hc_byte_perm (w4[2], w4[3], selector); + c4[1] = hc_byte_perm (w4[1], w4[2], selector); + c4[0] = hc_byte_perm (w4[0], w4[1], selector); + c3[3] = hc_byte_perm (w3[3], w4[0], selector); + c3[2] = hc_byte_perm (w3[2], w3[3], selector); + c3[1] = hc_byte_perm (w3[1], w3[2], selector); + c3[0] = hc_byte_perm (w3[0], w3[1], selector); + c2[3] = hc_byte_perm (w2[3], w3[0], selector); + c2[2] = hc_byte_perm (w2[2], w2[3], selector); + c2[1] = hc_byte_perm (w2[1], w2[2], selector); + c2[0] = hc_byte_perm (w2[0], w2[1], selector); + c1[3] = hc_byte_perm (w1[3], w2[0], selector); + c1[2] = hc_byte_perm (w1[2], w1[3], selector); + c1[1] = hc_byte_perm (w1[1], w1[2], selector); + c1[0] = hc_byte_perm (w1[0], w1[1], selector); + c0[3] = hc_byte_perm (w0[3], w1[0], selector); + c0[2] = hc_byte_perm (w0[2], w0[3], selector); + c0[1] = hc_byte_perm (w0[1], w0[2], selector); + c0[0] = hc_byte_perm (w0[0], w0[1], selector); + w7[3] = hc_byte_perm ( 0, w0[0], selector); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif +} + DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset) { const int offset_switch = offset / 4; @@ -37702,6 +41090,3394 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 * #endif } +DECLSPEC void switch_buffer_by_offset_8x4_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset) +{ + const int offset_switch = offset / 4; + + #if (defined IS_AMD && HAS_VPERM == 0) || defined IS_GENERIC + switch (offset_switch) + { + case 0: + c0[0] = hc_bytealign_S (w7[3], 0, offset); + w7[3] = hc_bytealign_S (w7[2], w7[3], offset); + w7[2] = hc_bytealign_S (w7[1], w7[2], offset); + w7[1] = hc_bytealign_S (w7[0], w7[1], offset); + w7[0] = hc_bytealign_S (w6[3], w7[0], offset); + w6[3] = hc_bytealign_S (w6[2], w6[3], offset); + w6[2] = hc_bytealign_S (w6[1], w6[2], offset); + w6[1] = hc_bytealign_S (w6[0], w6[1], offset); + w6[0] = hc_bytealign_S (w5[3], w6[0], offset); + w5[3] = hc_bytealign_S (w5[2], w5[3], offset); + w5[2] = hc_bytealign_S (w5[1], w5[2], offset); + w5[1] = hc_bytealign_S (w5[0], w5[1], offset); + w5[0] = hc_bytealign_S (w4[3], w5[0], offset); + w4[3] = hc_bytealign_S (w4[2], w4[3], offset); + w4[2] = hc_bytealign_S (w4[1], w4[2], offset); + w4[1] = hc_bytealign_S (w4[0], w4[1], offset); + w4[0] = hc_bytealign_S (w3[3], w4[0], offset); + w3[3] = hc_bytealign_S (w3[2], w3[3], offset); + w3[2] = hc_bytealign_S (w3[1], w3[2], offset); + w3[1] = hc_bytealign_S (w3[0], w3[1], offset); + w3[0] = hc_bytealign_S (w2[3], w3[0], offset); + w2[3] = hc_bytealign_S (w2[2], w2[3], offset); + w2[2] = hc_bytealign_S (w2[1], w2[2], offset); + w2[1] = hc_bytealign_S (w2[0], w2[1], offset); + w2[0] = hc_bytealign_S (w1[3], w2[0], offset); + w1[3] = hc_bytealign_S (w1[2], w1[3], offset); + w1[2] = hc_bytealign_S (w1[1], w1[2], offset); + w1[1] = hc_bytealign_S (w1[0], w1[1], offset); + w1[0] = hc_bytealign_S (w0[3], w1[0], offset); + w0[3] = hc_bytealign_S (w0[2], w0[3], offset); + w0[2] = hc_bytealign_S (w0[1], w0[2], offset); + w0[1] = hc_bytealign_S (w0[0], w0[1], offset); + w0[0] = hc_bytealign_S ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = hc_bytealign_S (w7[3], 0, offset); + c0[0] = hc_bytealign_S (w7[2], w7[3], offset); + w7[3] = hc_bytealign_S (w7[1], w7[2], offset); + w7[2] = hc_bytealign_S (w7[0], w7[1], offset); + w7[1] = hc_bytealign_S (w6[3], w7[0], offset); + w7[0] = hc_bytealign_S (w6[2], w6[3], offset); + w6[3] = hc_bytealign_S (w6[1], w6[2], offset); + w6[2] = hc_bytealign_S (w6[0], w6[1], offset); + w6[1] = hc_bytealign_S (w5[3], w6[0], offset); + w6[0] = hc_bytealign_S (w5[2], w5[3], offset); + w5[3] = hc_bytealign_S (w5[1], w5[2], offset); + w5[2] = hc_bytealign_S (w5[0], w5[1], offset); + w5[1] = hc_bytealign_S (w4[3], w5[0], offset); + w5[0] = hc_bytealign_S (w4[2], w4[3], offset); + w4[3] = hc_bytealign_S (w4[1], w4[2], offset); + w4[2] = hc_bytealign_S (w4[0], w4[1], offset); + w4[1] = hc_bytealign_S (w3[3], w4[0], offset); + w4[0] = hc_bytealign_S (w3[2], w3[3], offset); + w3[3] = hc_bytealign_S (w3[1], w3[2], offset); + w3[2] = hc_bytealign_S (w3[0], w3[1], offset); + w3[1] = hc_bytealign_S (w2[3], w3[0], offset); + w3[0] = hc_bytealign_S (w2[2], w2[3], offset); + w2[3] = hc_bytealign_S (w2[1], w2[2], offset); + w2[2] = hc_bytealign_S (w2[0], w2[1], offset); + w2[1] = hc_bytealign_S (w1[3], w2[0], offset); + w2[0] = hc_bytealign_S (w1[2], w1[3], offset); + w1[3] = hc_bytealign_S (w1[1], w1[2], offset); + w1[2] = hc_bytealign_S (w1[0], w1[1], offset); + w1[1] = hc_bytealign_S (w0[3], w1[0], offset); + w1[0] = hc_bytealign_S (w0[2], w0[3], offset); + w0[3] = hc_bytealign_S (w0[1], w0[2], offset); + w0[2] = hc_bytealign_S (w0[0], w0[1], offset); + w0[1] = hc_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_bytealign_S (w7[3], 0, offset); + c0[1] = hc_bytealign_S (w7[2], w7[3], offset); + c0[0] = hc_bytealign_S (w7[1], w7[2], offset); + w7[3] = hc_bytealign_S (w7[0], w7[1], offset); + w7[2] = hc_bytealign_S (w6[3], w7[0], offset); + w7[1] = hc_bytealign_S (w6[2], w6[3], offset); + w7[0] = hc_bytealign_S (w6[1], w6[2], offset); + w6[3] = hc_bytealign_S (w6[0], w6[1], offset); + w6[2] = hc_bytealign_S (w5[3], w6[0], offset); + w6[1] = hc_bytealign_S (w5[2], w5[3], offset); + w6[0] = hc_bytealign_S (w5[1], w5[2], offset); + w5[3] = hc_bytealign_S (w5[0], w5[1], offset); + w5[2] = hc_bytealign_S (w4[3], w5[0], offset); + w5[1] = hc_bytealign_S (w4[2], w4[3], offset); + w5[0] = hc_bytealign_S (w4[1], w4[2], offset); + w4[3] = hc_bytealign_S (w4[0], w4[1], offset); + w4[2] = hc_bytealign_S (w3[3], w4[0], offset); + w4[1] = hc_bytealign_S (w3[2], w3[3], offset); + w4[0] = hc_bytealign_S (w3[1], w3[2], offset); + w3[3] = hc_bytealign_S (w3[0], w3[1], offset); + w3[2] = hc_bytealign_S (w2[3], w3[0], offset); + w3[1] = hc_bytealign_S (w2[2], w2[3], offset); + w3[0] = hc_bytealign_S (w2[1], w2[2], offset); + w2[3] = hc_bytealign_S (w2[0], w2[1], offset); + w2[2] = hc_bytealign_S (w1[3], w2[0], offset); + w2[1] = hc_bytealign_S (w1[2], w1[3], offset); + w2[0] = hc_bytealign_S (w1[1], w1[2], offset); + w1[3] = hc_bytealign_S (w1[0], w1[1], offset); + w1[2] = hc_bytealign_S (w0[3], w1[0], offset); + w1[1] = hc_bytealign_S (w0[2], w0[3], offset); + w1[0] = hc_bytealign_S (w0[1], w0[2], offset); + w0[3] = hc_bytealign_S (w0[0], w0[1], offset); + w0[2] = hc_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_bytealign_S (w7[3], 0, offset); + c0[2] = hc_bytealign_S (w7[2], w7[3], offset); + c0[1] = hc_bytealign_S (w7[1], w7[2], offset); + c0[0] = hc_bytealign_S (w7[0], w7[1], offset); + w7[3] = hc_bytealign_S (w6[3], w7[0], offset); + w7[2] = hc_bytealign_S (w6[2], w6[3], offset); + w7[1] = hc_bytealign_S (w6[1], w6[2], offset); + w7[0] = hc_bytealign_S (w6[0], w6[1], offset); + w6[3] = hc_bytealign_S (w5[3], w6[0], offset); + w6[2] = hc_bytealign_S (w5[2], w5[3], offset); + w6[1] = hc_bytealign_S (w5[1], w5[2], offset); + w6[0] = hc_bytealign_S (w5[0], w5[1], offset); + w5[3] = hc_bytealign_S (w4[3], w5[0], offset); + w5[2] = hc_bytealign_S (w4[2], w4[3], offset); + w5[1] = hc_bytealign_S (w4[1], w4[2], offset); + w5[0] = hc_bytealign_S (w4[0], w4[1], offset); + w4[3] = hc_bytealign_S (w3[3], w4[0], offset); + w4[2] = hc_bytealign_S (w3[2], w3[3], offset); + w4[1] = hc_bytealign_S (w3[1], w3[2], offset); + w4[0] = hc_bytealign_S (w3[0], w3[1], offset); + w3[3] = hc_bytealign_S (w2[3], w3[0], offset); + w3[2] = hc_bytealign_S (w2[2], w2[3], offset); + w3[1] = hc_bytealign_S (w2[1], w2[2], offset); + w3[0] = hc_bytealign_S (w2[0], w2[1], offset); + w2[3] = hc_bytealign_S (w1[3], w2[0], offset); + w2[2] = hc_bytealign_S (w1[2], w1[3], offset); + w2[1] = hc_bytealign_S (w1[1], w1[2], offset); + w2[0] = hc_bytealign_S (w1[0], w1[1], offset); + w1[3] = hc_bytealign_S (w0[3], w1[0], offset); + w1[2] = hc_bytealign_S (w0[2], w0[3], offset); + w1[1] = hc_bytealign_S (w0[1], w0[2], offset); + w1[0] = hc_bytealign_S (w0[0], w0[1], offset); + w0[3] = hc_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_bytealign_S (w7[3], 0, offset); + c0[3] = hc_bytealign_S (w7[2], w7[3], offset); + c0[2] = hc_bytealign_S (w7[1], w7[2], offset); + c0[1] = hc_bytealign_S (w7[0], w7[1], offset); + c0[0] = hc_bytealign_S (w6[3], w7[0], offset); + w7[3] = hc_bytealign_S (w6[2], w6[3], offset); + w7[2] = hc_bytealign_S (w6[1], w6[2], offset); + w7[1] = hc_bytealign_S (w6[0], w6[1], offset); + w7[0] = hc_bytealign_S (w5[3], w6[0], offset); + w6[3] = hc_bytealign_S (w5[2], w5[3], offset); + w6[2] = hc_bytealign_S (w5[1], w5[2], offset); + w6[1] = hc_bytealign_S (w5[0], w5[1], offset); + w6[0] = hc_bytealign_S (w4[3], w5[0], offset); + w5[3] = hc_bytealign_S (w4[2], w4[3], offset); + w5[2] = hc_bytealign_S (w4[1], w4[2], offset); + w5[1] = hc_bytealign_S (w4[0], w4[1], offset); + w5[0] = hc_bytealign_S (w3[3], w4[0], offset); + w4[3] = hc_bytealign_S (w3[2], w3[3], offset); + w4[2] = hc_bytealign_S (w3[1], w3[2], offset); + w4[1] = hc_bytealign_S (w3[0], w3[1], offset); + w4[0] = hc_bytealign_S (w2[3], w3[0], offset); + w3[3] = hc_bytealign_S (w2[2], w2[3], offset); + w3[2] = hc_bytealign_S (w2[1], w2[2], offset); + w3[1] = hc_bytealign_S (w2[0], w2[1], offset); + w3[0] = hc_bytealign_S (w1[3], w2[0], offset); + w2[3] = hc_bytealign_S (w1[2], w1[3], offset); + w2[2] = hc_bytealign_S (w1[1], w1[2], offset); + w2[1] = hc_bytealign_S (w1[0], w1[1], offset); + w2[0] = hc_bytealign_S (w0[3], w1[0], offset); + w1[3] = hc_bytealign_S (w0[2], w0[3], offset); + w1[2] = hc_bytealign_S (w0[1], w0[2], offset); + w1[1] = hc_bytealign_S (w0[0], w0[1], offset); + w1[0] = hc_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = hc_bytealign_S (w7[3], 0, offset); + c1[0] = hc_bytealign_S (w7[2], w7[3], offset); + c0[3] = hc_bytealign_S (w7[1], w7[2], offset); + c0[2] = hc_bytealign_S (w7[0], w7[1], offset); + c0[1] = hc_bytealign_S (w6[3], w7[0], offset); + c0[0] = hc_bytealign_S (w6[2], w6[3], offset); + w7[3] = hc_bytealign_S (w6[1], w6[2], offset); + w7[2] = hc_bytealign_S (w6[0], w6[1], offset); + w7[1] = hc_bytealign_S (w5[3], w6[0], offset); + w7[0] = hc_bytealign_S (w5[2], w5[3], offset); + w6[3] = hc_bytealign_S (w5[1], w5[2], offset); + w6[2] = hc_bytealign_S (w5[0], w5[1], offset); + w6[1] = hc_bytealign_S (w4[3], w5[0], offset); + w6[0] = hc_bytealign_S (w4[2], w4[3], offset); + w5[3] = hc_bytealign_S (w4[1], w4[2], offset); + w5[2] = hc_bytealign_S (w4[0], w4[1], offset); + w5[1] = hc_bytealign_S (w3[3], w4[0], offset); + w5[0] = hc_bytealign_S (w3[2], w3[3], offset); + w4[3] = hc_bytealign_S (w3[1], w3[2], offset); + w4[2] = hc_bytealign_S (w3[0], w3[1], offset); + w4[1] = hc_bytealign_S (w2[3], w3[0], offset); + w4[0] = hc_bytealign_S (w2[2], w2[3], offset); + w3[3] = hc_bytealign_S (w2[1], w2[2], offset); + w3[2] = hc_bytealign_S (w2[0], w2[1], offset); + w3[1] = hc_bytealign_S (w1[3], w2[0], offset); + w3[0] = hc_bytealign_S (w1[2], w1[3], offset); + w2[3] = hc_bytealign_S (w1[1], w1[2], offset); + w2[2] = hc_bytealign_S (w1[0], w1[1], offset); + w2[1] = hc_bytealign_S (w0[3], w1[0], offset); + w2[0] = hc_bytealign_S (w0[2], w0[3], offset); + w1[3] = hc_bytealign_S (w0[1], w0[2], offset); + w1[2] = hc_bytealign_S (w0[0], w0[1], offset); + w1[1] = hc_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_bytealign_S (w7[3], 0, offset); + c1[1] = hc_bytealign_S (w7[2], w7[3], offset); + c1[0] = hc_bytealign_S (w7[1], w7[2], offset); + c0[3] = hc_bytealign_S (w7[0], w7[1], offset); + c0[2] = hc_bytealign_S (w6[3], w7[0], offset); + c0[1] = hc_bytealign_S (w6[2], w6[3], offset); + c0[0] = hc_bytealign_S (w6[1], w6[2], offset); + w7[3] = hc_bytealign_S (w6[0], w6[1], offset); + w7[2] = hc_bytealign_S (w5[3], w6[0], offset); + w7[1] = hc_bytealign_S (w5[2], w5[3], offset); + w7[0] = hc_bytealign_S (w5[1], w5[2], offset); + w6[3] = hc_bytealign_S (w5[0], w5[1], offset); + w6[2] = hc_bytealign_S (w4[3], w5[0], offset); + w6[1] = hc_bytealign_S (w4[2], w4[3], offset); + w6[0] = hc_bytealign_S (w4[1], w4[2], offset); + w5[3] = hc_bytealign_S (w4[0], w4[1], offset); + w5[2] = hc_bytealign_S (w3[3], w4[0], offset); + w5[1] = hc_bytealign_S (w3[2], w3[3], offset); + w5[0] = hc_bytealign_S (w3[1], w3[2], offset); + w4[3] = hc_bytealign_S (w3[0], w3[1], offset); + w4[2] = hc_bytealign_S (w2[3], w3[0], offset); + w4[1] = hc_bytealign_S (w2[2], w2[3], offset); + w4[0] = hc_bytealign_S (w2[1], w2[2], offset); + w3[3] = hc_bytealign_S (w2[0], w2[1], offset); + w3[2] = hc_bytealign_S (w1[3], w2[0], offset); + w3[1] = hc_bytealign_S (w1[2], w1[3], offset); + w3[0] = hc_bytealign_S (w1[1], w1[2], offset); + w2[3] = hc_bytealign_S (w1[0], w1[1], offset); + w2[2] = hc_bytealign_S (w0[3], w1[0], offset); + w2[1] = hc_bytealign_S (w0[2], w0[3], offset); + w2[0] = hc_bytealign_S (w0[1], w0[2], offset); + w1[3] = hc_bytealign_S (w0[0], w0[1], offset); + w1[2] = hc_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = hc_bytealign_S (w7[3], 0, offset); + c1[2] = hc_bytealign_S (w7[2], w7[3], offset); + c1[1] = hc_bytealign_S (w7[1], w7[2], offset); + c1[0] = hc_bytealign_S (w7[0], w7[1], offset); + c0[3] = hc_bytealign_S (w6[3], w7[0], offset); + c0[2] = hc_bytealign_S (w6[2], w6[3], offset); + c0[1] = hc_bytealign_S (w6[1], w6[2], offset); + c0[0] = hc_bytealign_S (w6[0], w6[1], offset); + w7[3] = hc_bytealign_S (w5[3], w6[0], offset); + w7[2] = hc_bytealign_S (w5[2], w5[3], offset); + w7[1] = hc_bytealign_S (w5[1], w5[2], offset); + w7[0] = hc_bytealign_S (w5[0], w5[1], offset); + w6[3] = hc_bytealign_S (w4[3], w5[0], offset); + w6[2] = hc_bytealign_S (w4[2], w4[3], offset); + w6[1] = hc_bytealign_S (w4[1], w4[2], offset); + w6[0] = hc_bytealign_S (w4[0], w4[1], offset); + w5[3] = hc_bytealign_S (w3[3], w4[0], offset); + w5[2] = hc_bytealign_S (w3[2], w3[3], offset); + w5[1] = hc_bytealign_S (w3[1], w3[2], offset); + w5[0] = hc_bytealign_S (w3[0], w3[1], offset); + w4[3] = hc_bytealign_S (w2[3], w3[0], offset); + w4[2] = hc_bytealign_S (w2[2], w2[3], offset); + w4[1] = hc_bytealign_S (w2[1], w2[2], offset); + w4[0] = hc_bytealign_S (w2[0], w2[1], offset); + w3[3] = hc_bytealign_S (w1[3], w2[0], offset); + w3[2] = hc_bytealign_S (w1[2], w1[3], offset); + w3[1] = hc_bytealign_S (w1[1], w1[2], offset); + w3[0] = hc_bytealign_S (w1[0], w1[1], offset); + w2[3] = hc_bytealign_S (w0[3], w1[0], offset); + w2[2] = hc_bytealign_S (w0[2], w0[3], offset); + w2[1] = hc_bytealign_S (w0[1], w0[2], offset); + w2[0] = hc_bytealign_S (w0[0], w0[1], offset); + w1[3] = hc_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = hc_bytealign_S (w7[3], 0, offset); + c1[3] = hc_bytealign_S (w7[2], w7[3], offset); + c1[2] = hc_bytealign_S (w7[1], w7[2], offset); + c1[1] = hc_bytealign_S (w7[0], w7[1], offset); + c1[0] = hc_bytealign_S (w6[3], w7[0], offset); + c0[3] = hc_bytealign_S (w6[2], w6[3], offset); + c0[2] = hc_bytealign_S (w6[1], w6[2], offset); + c0[1] = hc_bytealign_S (w6[0], w6[1], offset); + c0[0] = hc_bytealign_S (w5[3], w6[0], offset); + w7[3] = hc_bytealign_S (w5[2], w5[3], offset); + w7[2] = hc_bytealign_S (w5[1], w5[2], offset); + w7[1] = hc_bytealign_S (w5[0], w5[1], offset); + w7[0] = hc_bytealign_S (w4[3], w5[0], offset); + w6[3] = hc_bytealign_S (w4[2], w4[3], offset); + w6[2] = hc_bytealign_S (w4[1], w4[2], offset); + w6[1] = hc_bytealign_S (w4[0], w4[1], offset); + w6[0] = hc_bytealign_S (w3[3], w4[0], offset); + w5[3] = hc_bytealign_S (w3[2], w3[3], offset); + w5[2] = hc_bytealign_S (w3[1], w3[2], offset); + w5[1] = hc_bytealign_S (w3[0], w3[1], offset); + w5[0] = hc_bytealign_S (w2[3], w3[0], offset); + w4[3] = hc_bytealign_S (w2[2], w2[3], offset); + w4[2] = hc_bytealign_S (w2[1], w2[2], offset); + w4[1] = hc_bytealign_S (w2[0], w2[1], offset); + w4[0] = hc_bytealign_S (w1[3], w2[0], offset); + w3[3] = hc_bytealign_S (w1[2], w1[3], offset); + w3[2] = hc_bytealign_S (w1[1], w1[2], offset); + w3[1] = hc_bytealign_S (w1[0], w1[1], offset); + w3[0] = hc_bytealign_S (w0[3], w1[0], offset); + w2[3] = hc_bytealign_S (w0[2], w0[3], offset); + w2[2] = hc_bytealign_S (w0[1], w0[2], offset); + w2[1] = hc_bytealign_S (w0[0], w0[1], offset); + w2[0] = hc_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = hc_bytealign_S (w7[3], 0, offset); + c2[0] = hc_bytealign_S (w7[2], w7[3], offset); + c1[3] = hc_bytealign_S (w7[1], w7[2], offset); + c1[2] = hc_bytealign_S (w7[0], w7[1], offset); + c1[1] = hc_bytealign_S (w6[3], w7[0], offset); + c1[0] = hc_bytealign_S (w6[2], w6[3], offset); + c0[3] = hc_bytealign_S (w6[1], w6[2], offset); + c0[2] = hc_bytealign_S (w6[0], w6[1], offset); + c0[1] = hc_bytealign_S (w5[3], w6[0], offset); + c0[0] = hc_bytealign_S (w5[2], w5[3], offset); + w7[3] = hc_bytealign_S (w5[1], w5[2], offset); + w7[2] = hc_bytealign_S (w5[0], w5[1], offset); + w7[1] = hc_bytealign_S (w4[3], w5[0], offset); + w7[0] = hc_bytealign_S (w4[2], w4[3], offset); + w6[3] = hc_bytealign_S (w4[1], w4[2], offset); + w6[2] = hc_bytealign_S (w4[0], w4[1], offset); + w6[1] = hc_bytealign_S (w3[3], w4[0], offset); + w6[0] = hc_bytealign_S (w3[2], w3[3], offset); + w5[3] = hc_bytealign_S (w3[1], w3[2], offset); + w5[2] = hc_bytealign_S (w3[0], w3[1], offset); + w5[1] = hc_bytealign_S (w2[3], w3[0], offset); + w5[0] = hc_bytealign_S (w2[2], w2[3], offset); + w4[3] = hc_bytealign_S (w2[1], w2[2], offset); + w4[2] = hc_bytealign_S (w2[0], w2[1], offset); + w4[1] = hc_bytealign_S (w1[3], w2[0], offset); + w4[0] = hc_bytealign_S (w1[2], w1[3], offset); + w3[3] = hc_bytealign_S (w1[1], w1[2], offset); + w3[2] = hc_bytealign_S (w1[0], w1[1], offset); + w3[1] = hc_bytealign_S (w0[3], w1[0], offset); + w3[0] = hc_bytealign_S (w0[2], w0[3], offset); + w2[3] = hc_bytealign_S (w0[1], w0[2], offset); + w2[2] = hc_bytealign_S (w0[0], w0[1], offset); + w2[1] = hc_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = hc_bytealign_S (w7[3], 0, offset); + c2[1] = hc_bytealign_S (w7[2], w7[3], offset); + c2[0] = hc_bytealign_S (w7[1], w7[2], offset); + c1[3] = hc_bytealign_S (w7[0], w7[1], offset); + c1[2] = hc_bytealign_S (w6[3], w7[0], offset); + c1[1] = hc_bytealign_S (w6[2], w6[3], offset); + c1[0] = hc_bytealign_S (w6[1], w6[2], offset); + c0[3] = hc_bytealign_S (w6[0], w6[1], offset); + c0[2] = hc_bytealign_S (w5[3], w6[0], offset); + c0[1] = hc_bytealign_S (w5[2], w5[3], offset); + c0[0] = hc_bytealign_S (w5[1], w5[2], offset); + w7[3] = hc_bytealign_S (w5[0], w5[1], offset); + w7[2] = hc_bytealign_S (w4[3], w5[0], offset); + w7[1] = hc_bytealign_S (w4[2], w4[3], offset); + w7[0] = hc_bytealign_S (w4[1], w4[2], offset); + w6[3] = hc_bytealign_S (w4[0], w4[1], offset); + w6[2] = hc_bytealign_S (w3[3], w4[0], offset); + w6[1] = hc_bytealign_S (w3[2], w3[3], offset); + w6[0] = hc_bytealign_S (w3[1], w3[2], offset); + w5[3] = hc_bytealign_S (w3[0], w3[1], offset); + w5[2] = hc_bytealign_S (w2[3], w3[0], offset); + w5[1] = hc_bytealign_S (w2[2], w2[3], offset); + w5[0] = hc_bytealign_S (w2[1], w2[2], offset); + w4[3] = hc_bytealign_S (w2[0], w2[1], offset); + w4[2] = hc_bytealign_S (w1[3], w2[0], offset); + w4[1] = hc_bytealign_S (w1[2], w1[3], offset); + w4[0] = hc_bytealign_S (w1[1], w1[2], offset); + w3[3] = hc_bytealign_S (w1[0], w1[1], offset); + w3[2] = hc_bytealign_S (w0[3], w1[0], offset); + w3[1] = hc_bytealign_S (w0[2], w0[3], offset); + w3[0] = hc_bytealign_S (w0[1], w0[2], offset); + w2[3] = hc_bytealign_S (w0[0], w0[1], offset); + w2[2] = hc_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_bytealign_S (w7[3], 0, offset); + c2[2] = hc_bytealign_S (w7[2], w7[3], offset); + c2[1] = hc_bytealign_S (w7[1], w7[2], offset); + c2[0] = hc_bytealign_S (w7[0], w7[1], offset); + c1[3] = hc_bytealign_S (w6[3], w7[0], offset); + c1[2] = hc_bytealign_S (w6[2], w6[3], offset); + c1[1] = hc_bytealign_S (w6[1], w6[2], offset); + c1[0] = hc_bytealign_S (w6[0], w6[1], offset); + c0[3] = hc_bytealign_S (w5[3], w6[0], offset); + c0[2] = hc_bytealign_S (w5[2], w5[3], offset); + c0[1] = hc_bytealign_S (w5[1], w5[2], offset); + c0[0] = hc_bytealign_S (w5[0], w5[1], offset); + w7[3] = hc_bytealign_S (w4[3], w5[0], offset); + w7[2] = hc_bytealign_S (w4[2], w4[3], offset); + w7[1] = hc_bytealign_S (w4[1], w4[2], offset); + w7[0] = hc_bytealign_S (w4[0], w4[1], offset); + w6[3] = hc_bytealign_S (w3[3], w4[0], offset); + w6[2] = hc_bytealign_S (w3[2], w3[3], offset); + w6[1] = hc_bytealign_S (w3[1], w3[2], offset); + w6[0] = hc_bytealign_S (w3[0], w3[1], offset); + w5[3] = hc_bytealign_S (w2[3], w3[0], offset); + w5[2] = hc_bytealign_S (w2[2], w2[3], offset); + w5[1] = hc_bytealign_S (w2[1], w2[2], offset); + w5[0] = hc_bytealign_S (w2[0], w2[1], offset); + w4[3] = hc_bytealign_S (w1[3], w2[0], offset); + w4[2] = hc_bytealign_S (w1[2], w1[3], offset); + w4[1] = hc_bytealign_S (w1[1], w1[2], offset); + w4[0] = hc_bytealign_S (w1[0], w1[1], offset); + w3[3] = hc_bytealign_S (w0[3], w1[0], offset); + w3[2] = hc_bytealign_S (w0[2], w0[3], offset); + w3[1] = hc_bytealign_S (w0[1], w0[2], offset); + w3[0] = hc_bytealign_S (w0[0], w0[1], offset); + w2[3] = hc_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_bytealign_S (w7[3], 0, offset); + c2[3] = hc_bytealign_S (w7[2], w7[3], offset); + c2[2] = hc_bytealign_S (w7[1], w7[2], offset); + c2[1] = hc_bytealign_S (w7[0], w7[1], offset); + c2[0] = hc_bytealign_S (w6[3], w7[0], offset); + c1[3] = hc_bytealign_S (w6[2], w6[3], offset); + c1[2] = hc_bytealign_S (w6[1], w6[2], offset); + c1[1] = hc_bytealign_S (w6[0], w6[1], offset); + c1[0] = hc_bytealign_S (w5[3], w6[0], offset); + c0[3] = hc_bytealign_S (w5[2], w5[3], offset); + c0[2] = hc_bytealign_S (w5[1], w5[2], offset); + c0[1] = hc_bytealign_S (w5[0], w5[1], offset); + c0[0] = hc_bytealign_S (w4[3], w5[0], offset); + w7[3] = hc_bytealign_S (w4[2], w4[3], offset); + w7[2] = hc_bytealign_S (w4[1], w4[2], offset); + w7[1] = hc_bytealign_S (w4[0], w4[1], offset); + w7[0] = hc_bytealign_S (w3[3], w4[0], offset); + w6[3] = hc_bytealign_S (w3[2], w3[3], offset); + w6[2] = hc_bytealign_S (w3[1], w3[2], offset); + w6[1] = hc_bytealign_S (w3[0], w3[1], offset); + w6[0] = hc_bytealign_S (w2[3], w3[0], offset); + w5[3] = hc_bytealign_S (w2[2], w2[3], offset); + w5[2] = hc_bytealign_S (w2[1], w2[2], offset); + w5[1] = hc_bytealign_S (w2[0], w2[1], offset); + w5[0] = hc_bytealign_S (w1[3], w2[0], offset); + w4[3] = hc_bytealign_S (w1[2], w1[3], offset); + w4[2] = hc_bytealign_S (w1[1], w1[2], offset); + w4[1] = hc_bytealign_S (w1[0], w1[1], offset); + w4[0] = hc_bytealign_S (w0[3], w1[0], offset); + w3[3] = hc_bytealign_S (w0[2], w0[3], offset); + w3[2] = hc_bytealign_S (w0[1], w0[2], offset); + w3[1] = hc_bytealign_S (w0[0], w0[1], offset); + w3[0] = hc_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = hc_bytealign_S (w7[3], 0, offset); + c3[0] = hc_bytealign_S (w7[2], w7[3], offset); + c2[3] = hc_bytealign_S (w7[1], w7[2], offset); + c2[2] = hc_bytealign_S (w7[0], w7[1], offset); + c2[1] = hc_bytealign_S (w6[3], w7[0], offset); + c2[0] = hc_bytealign_S (w6[2], w6[3], offset); + c1[3] = hc_bytealign_S (w6[1], w6[2], offset); + c1[2] = hc_bytealign_S (w6[0], w6[1], offset); + c1[1] = hc_bytealign_S (w5[3], w6[0], offset); + c1[0] = hc_bytealign_S (w5[2], w5[3], offset); + c0[3] = hc_bytealign_S (w5[1], w5[2], offset); + c0[2] = hc_bytealign_S (w5[0], w5[1], offset); + c0[1] = hc_bytealign_S (w4[3], w5[0], offset); + c0[0] = hc_bytealign_S (w4[2], w4[3], offset); + w7[3] = hc_bytealign_S (w4[1], w4[2], offset); + w7[2] = hc_bytealign_S (w4[0], w4[1], offset); + w7[1] = hc_bytealign_S (w3[3], w4[0], offset); + w7[0] = hc_bytealign_S (w3[2], w3[3], offset); + w6[3] = hc_bytealign_S (w3[1], w3[2], offset); + w6[2] = hc_bytealign_S (w3[0], w3[1], offset); + w6[1] = hc_bytealign_S (w2[3], w3[0], offset); + w6[0] = hc_bytealign_S (w2[2], w2[3], offset); + w5[3] = hc_bytealign_S (w2[1], w2[2], offset); + w5[2] = hc_bytealign_S (w2[0], w2[1], offset); + w5[1] = hc_bytealign_S (w1[3], w2[0], offset); + w5[0] = hc_bytealign_S (w1[2], w1[3], offset); + w4[3] = hc_bytealign_S (w1[1], w1[2], offset); + w4[2] = hc_bytealign_S (w1[0], w1[1], offset); + w4[1] = hc_bytealign_S (w0[3], w1[0], offset); + w4[0] = hc_bytealign_S (w0[2], w0[3], offset); + w3[3] = hc_bytealign_S (w0[1], w0[2], offset); + w3[2] = hc_bytealign_S (w0[0], w0[1], offset); + w3[1] = hc_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = hc_bytealign_S (w7[3], 0, offset); + c3[1] = hc_bytealign_S (w7[2], w7[3], offset); + c3[0] = hc_bytealign_S (w7[1], w7[2], offset); + c2[3] = hc_bytealign_S (w7[0], w7[1], offset); + c2[2] = hc_bytealign_S (w6[3], w7[0], offset); + c2[1] = hc_bytealign_S (w6[2], w6[3], offset); + c2[0] = hc_bytealign_S (w6[1], w6[2], offset); + c1[3] = hc_bytealign_S (w6[0], w6[1], offset); + c1[2] = hc_bytealign_S (w5[3], w6[0], offset); + c1[1] = hc_bytealign_S (w5[2], w5[3], offset); + c1[0] = hc_bytealign_S (w5[1], w5[2], offset); + c0[3] = hc_bytealign_S (w5[0], w5[1], offset); + c0[2] = hc_bytealign_S (w4[3], w5[0], offset); + c0[1] = hc_bytealign_S (w4[2], w4[3], offset); + c0[0] = hc_bytealign_S (w4[1], w4[2], offset); + w7[3] = hc_bytealign_S (w4[0], w4[1], offset); + w7[2] = hc_bytealign_S (w3[3], w4[0], offset); + w7[1] = hc_bytealign_S (w3[2], w3[3], offset); + w7[0] = hc_bytealign_S (w3[1], w3[2], offset); + w6[3] = hc_bytealign_S (w3[0], w3[1], offset); + w6[2] = hc_bytealign_S (w2[3], w3[0], offset); + w6[1] = hc_bytealign_S (w2[2], w2[3], offset); + w6[0] = hc_bytealign_S (w2[1], w2[2], offset); + w5[3] = hc_bytealign_S (w2[0], w2[1], offset); + w5[2] = hc_bytealign_S (w1[3], w2[0], offset); + w5[1] = hc_bytealign_S (w1[2], w1[3], offset); + w5[0] = hc_bytealign_S (w1[1], w1[2], offset); + w4[3] = hc_bytealign_S (w1[0], w1[1], offset); + w4[2] = hc_bytealign_S (w0[3], w1[0], offset); + w4[1] = hc_bytealign_S (w0[2], w0[3], offset); + w4[0] = hc_bytealign_S (w0[1], w0[2], offset); + w3[3] = hc_bytealign_S (w0[0], w0[1], offset); + w3[2] = hc_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = hc_bytealign_S (w7[3], 0, offset); + c3[2] = hc_bytealign_S (w7[2], w7[3], offset); + c3[1] = hc_bytealign_S (w7[1], w7[2], offset); + c3[0] = hc_bytealign_S (w7[0], w7[1], offset); + c2[3] = hc_bytealign_S (w6[3], w7[0], offset); + c2[2] = hc_bytealign_S (w6[2], w6[3], offset); + c2[1] = hc_bytealign_S (w6[1], w6[2], offset); + c2[0] = hc_bytealign_S (w6[0], w6[1], offset); + c1[3] = hc_bytealign_S (w5[3], w6[0], offset); + c1[2] = hc_bytealign_S (w5[2], w5[3], offset); + c1[1] = hc_bytealign_S (w5[1], w5[2], offset); + c1[0] = hc_bytealign_S (w5[0], w5[1], offset); + c0[3] = hc_bytealign_S (w4[3], w5[0], offset); + c0[2] = hc_bytealign_S (w4[2], w4[3], offset); + c0[1] = hc_bytealign_S (w4[1], w4[2], offset); + c0[0] = hc_bytealign_S (w4[0], w4[1], offset); + w7[3] = hc_bytealign_S (w3[3], w4[0], offset); + w7[2] = hc_bytealign_S (w3[2], w3[3], offset); + w7[1] = hc_bytealign_S (w3[1], w3[2], offset); + w7[0] = hc_bytealign_S (w3[0], w3[1], offset); + w6[3] = hc_bytealign_S (w2[3], w3[0], offset); + w6[2] = hc_bytealign_S (w2[2], w2[3], offset); + w6[1] = hc_bytealign_S (w2[1], w2[2], offset); + w6[0] = hc_bytealign_S (w2[0], w2[1], offset); + w5[3] = hc_bytealign_S (w1[3], w2[0], offset); + w5[2] = hc_bytealign_S (w1[2], w1[3], offset); + w5[1] = hc_bytealign_S (w1[1], w1[2], offset); + w5[0] = hc_bytealign_S (w1[0], w1[1], offset); + w4[3] = hc_bytealign_S (w0[3], w1[0], offset); + w4[2] = hc_bytealign_S (w0[2], w0[3], offset); + w4[1] = hc_bytealign_S (w0[1], w0[2], offset); + w4[0] = hc_bytealign_S (w0[0], w0[1], offset); + w3[3] = hc_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + c4[0] = hc_bytealign_S (w7[3], 0, offset); + c3[3] = hc_bytealign_S (w7[2], w7[3], offset); + c3[2] = hc_bytealign_S (w7[1], w7[2], offset); + c3[1] = hc_bytealign_S (w7[0], w7[1], offset); + c3[0] = hc_bytealign_S (w6[3], w7[0], offset); + c2[3] = hc_bytealign_S (w6[2], w6[3], offset); + c2[2] = hc_bytealign_S (w6[1], w6[2], offset); + c2[1] = hc_bytealign_S (w6[0], w6[1], offset); + c2[0] = hc_bytealign_S (w5[3], w6[0], offset); + c1[3] = hc_bytealign_S (w5[2], w5[3], offset); + c1[2] = hc_bytealign_S (w5[1], w5[2], offset); + c1[1] = hc_bytealign_S (w5[0], w5[1], offset); + c1[0] = hc_bytealign_S (w4[3], w5[0], offset); + c0[3] = hc_bytealign_S (w4[2], w4[3], offset); + c0[2] = hc_bytealign_S (w4[1], w4[2], offset); + c0[1] = hc_bytealign_S (w4[0], w4[1], offset); + c0[0] = hc_bytealign_S (w3[3], w4[0], offset); + w7[3] = hc_bytealign_S (w3[2], w3[3], offset); + w7[2] = hc_bytealign_S (w3[1], w3[2], offset); + w7[1] = hc_bytealign_S (w3[0], w3[1], offset); + w7[0] = hc_bytealign_S (w2[3], w3[0], offset); + w6[3] = hc_bytealign_S (w2[2], w2[3], offset); + w6[2] = hc_bytealign_S (w2[1], w2[2], offset); + w6[1] = hc_bytealign_S (w2[0], w2[1], offset); + w6[0] = hc_bytealign_S (w1[3], w2[0], offset); + w5[3] = hc_bytealign_S (w1[2], w1[3], offset); + w5[2] = hc_bytealign_S (w1[1], w1[2], offset); + w5[1] = hc_bytealign_S (w1[0], w1[1], offset); + w5[0] = hc_bytealign_S (w0[3], w1[0], offset); + w4[3] = hc_bytealign_S (w0[2], w0[3], offset); + w4[2] = hc_bytealign_S (w0[1], w0[2], offset); + w4[1] = hc_bytealign_S (w0[0], w0[1], offset); + w4[0] = hc_bytealign_S ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + c4[1] = hc_bytealign_S (w7[3], 0, offset); + c4[0] = hc_bytealign_S (w7[2], w7[3], offset); + c3[3] = hc_bytealign_S (w7[1], w7[2], offset); + c3[2] = hc_bytealign_S (w7[0], w7[1], offset); + c3[1] = hc_bytealign_S (w6[3], w7[0], offset); + c3[0] = hc_bytealign_S (w6[2], w6[3], offset); + c2[3] = hc_bytealign_S (w6[1], w6[2], offset); + c2[2] = hc_bytealign_S (w6[0], w6[1], offset); + c2[1] = hc_bytealign_S (w5[3], w6[0], offset); + c2[0] = hc_bytealign_S (w5[2], w5[3], offset); + c1[3] = hc_bytealign_S (w5[1], w5[2], offset); + c1[2] = hc_bytealign_S (w5[0], w5[1], offset); + c1[1] = hc_bytealign_S (w4[3], w5[0], offset); + c1[0] = hc_bytealign_S (w4[2], w4[3], offset); + c0[3] = hc_bytealign_S (w4[1], w4[2], offset); + c0[2] = hc_bytealign_S (w4[0], w4[1], offset); + c0[1] = hc_bytealign_S (w3[3], w4[0], offset); + c0[0] = hc_bytealign_S (w3[2], w3[3], offset); + w7[3] = hc_bytealign_S (w3[1], w3[2], offset); + w7[2] = hc_bytealign_S (w3[0], w3[1], offset); + w7[1] = hc_bytealign_S (w2[3], w3[0], offset); + w7[0] = hc_bytealign_S (w2[2], w2[3], offset); + w6[3] = hc_bytealign_S (w2[1], w2[2], offset); + w6[2] = hc_bytealign_S (w2[0], w2[1], offset); + w6[1] = hc_bytealign_S (w1[3], w2[0], offset); + w6[0] = hc_bytealign_S (w1[2], w1[3], offset); + w5[3] = hc_bytealign_S (w1[1], w1[2], offset); + w5[2] = hc_bytealign_S (w1[0], w1[1], offset); + w5[1] = hc_bytealign_S (w0[3], w1[0], offset); + w5[0] = hc_bytealign_S (w0[2], w0[3], offset); + w4[3] = hc_bytealign_S (w0[1], w0[2], offset); + w4[2] = hc_bytealign_S (w0[0], w0[1], offset); + w4[1] = hc_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + c4[2] = hc_bytealign_S (w7[3], 0, offset); + c4[1] = hc_bytealign_S (w7[2], w7[3], offset); + c4[0] = hc_bytealign_S (w7[1], w7[2], offset); + c3[3] = hc_bytealign_S (w7[0], w7[1], offset); + c3[2] = hc_bytealign_S (w6[3], w7[0], offset); + c3[1] = hc_bytealign_S (w6[2], w6[3], offset); + c3[0] = hc_bytealign_S (w6[1], w6[2], offset); + c2[3] = hc_bytealign_S (w6[0], w6[1], offset); + c2[2] = hc_bytealign_S (w5[3], w6[0], offset); + c2[1] = hc_bytealign_S (w5[2], w5[3], offset); + c2[0] = hc_bytealign_S (w5[1], w5[2], offset); + c1[3] = hc_bytealign_S (w5[0], w5[1], offset); + c1[2] = hc_bytealign_S (w4[3], w5[0], offset); + c1[1] = hc_bytealign_S (w4[2], w4[3], offset); + c1[0] = hc_bytealign_S (w4[1], w4[2], offset); + c0[3] = hc_bytealign_S (w4[0], w4[1], offset); + c0[2] = hc_bytealign_S (w3[3], w4[0], offset); + c0[1] = hc_bytealign_S (w3[2], w3[3], offset); + c0[0] = hc_bytealign_S (w3[1], w3[2], offset); + w7[3] = hc_bytealign_S (w3[0], w3[1], offset); + w7[2] = hc_bytealign_S (w2[3], w3[0], offset); + w7[1] = hc_bytealign_S (w2[2], w2[3], offset); + w7[0] = hc_bytealign_S (w2[1], w2[2], offset); + w6[3] = hc_bytealign_S (w2[0], w2[1], offset); + w6[2] = hc_bytealign_S (w1[3], w2[0], offset); + w6[1] = hc_bytealign_S (w1[2], w1[3], offset); + w6[0] = hc_bytealign_S (w1[1], w1[2], offset); + w5[3] = hc_bytealign_S (w1[0], w1[1], offset); + w5[2] = hc_bytealign_S (w0[3], w1[0], offset); + w5[1] = hc_bytealign_S (w0[2], w0[3], offset); + w5[0] = hc_bytealign_S (w0[1], w0[2], offset); + w4[3] = hc_bytealign_S (w0[0], w0[1], offset); + w4[2] = hc_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_bytealign_S (w7[3], 0, offset); + c4[2] = hc_bytealign_S (w7[2], w7[3], offset); + c4[1] = hc_bytealign_S (w7[1], w7[2], offset); + c4[0] = hc_bytealign_S (w7[0], w7[1], offset); + c3[3] = hc_bytealign_S (w6[3], w7[0], offset); + c3[2] = hc_bytealign_S (w6[2], w6[3], offset); + c3[1] = hc_bytealign_S (w6[1], w6[2], offset); + c3[0] = hc_bytealign_S (w6[0], w6[1], offset); + c2[3] = hc_bytealign_S (w5[3], w6[0], offset); + c2[2] = hc_bytealign_S (w5[2], w5[3], offset); + c2[1] = hc_bytealign_S (w5[1], w5[2], offset); + c2[0] = hc_bytealign_S (w5[0], w5[1], offset); + c1[3] = hc_bytealign_S (w4[3], w5[0], offset); + c1[2] = hc_bytealign_S (w4[2], w4[3], offset); + c1[1] = hc_bytealign_S (w4[1], w4[2], offset); + c1[0] = hc_bytealign_S (w4[0], w4[1], offset); + c0[3] = hc_bytealign_S (w3[3], w4[0], offset); + c0[2] = hc_bytealign_S (w3[2], w3[3], offset); + c0[1] = hc_bytealign_S (w3[1], w3[2], offset); + c0[0] = hc_bytealign_S (w3[0], w3[1], offset); + w7[3] = hc_bytealign_S (w2[3], w3[0], offset); + w7[2] = hc_bytealign_S (w2[2], w2[3], offset); + w7[1] = hc_bytealign_S (w2[1], w2[2], offset); + w7[0] = hc_bytealign_S (w2[0], w2[1], offset); + w6[3] = hc_bytealign_S (w1[3], w2[0], offset); + w6[2] = hc_bytealign_S (w1[2], w1[3], offset); + w6[1] = hc_bytealign_S (w1[1], w1[2], offset); + w6[0] = hc_bytealign_S (w1[0], w1[1], offset); + w5[3] = hc_bytealign_S (w0[3], w1[0], offset); + w5[2] = hc_bytealign_S (w0[2], w0[3], offset); + w5[1] = hc_bytealign_S (w0[1], w0[2], offset); + w5[0] = hc_bytealign_S (w0[0], w0[1], offset); + w4[3] = hc_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + c5[0] = hc_bytealign_S (w7[3], 0, offset); + c4[3] = hc_bytealign_S (w7[2], w7[3], offset); + c4[2] = hc_bytealign_S (w7[1], w7[2], offset); + c4[1] = hc_bytealign_S (w7[0], w7[1], offset); + c4[0] = hc_bytealign_S (w6[3], w7[0], offset); + c3[3] = hc_bytealign_S (w6[2], w6[3], offset); + c3[2] = hc_bytealign_S (w6[1], w6[2], offset); + c3[1] = hc_bytealign_S (w6[0], w6[1], offset); + c3[0] = hc_bytealign_S (w5[3], w6[0], offset); + c2[3] = hc_bytealign_S (w5[2], w5[3], offset); + c2[2] = hc_bytealign_S (w5[1], w5[2], offset); + c2[1] = hc_bytealign_S (w5[0], w5[1], offset); + c2[0] = hc_bytealign_S (w4[3], w5[0], offset); + c1[3] = hc_bytealign_S (w4[2], w4[3], offset); + c1[2] = hc_bytealign_S (w4[1], w4[2], offset); + c1[1] = hc_bytealign_S (w4[0], w4[1], offset); + c1[0] = hc_bytealign_S (w3[3], w4[0], offset); + c0[3] = hc_bytealign_S (w3[2], w3[3], offset); + c0[2] = hc_bytealign_S (w3[1], w3[2], offset); + c0[1] = hc_bytealign_S (w3[0], w3[1], offset); + c0[0] = hc_bytealign_S (w2[3], w3[0], offset); + w7[3] = hc_bytealign_S (w2[2], w2[3], offset); + w7[2] = hc_bytealign_S (w2[1], w2[2], offset); + w7[1] = hc_bytealign_S (w2[0], w2[1], offset); + w7[0] = hc_bytealign_S (w1[3], w2[0], offset); + w6[3] = hc_bytealign_S (w1[2], w1[3], offset); + w6[2] = hc_bytealign_S (w1[1], w1[2], offset); + w6[1] = hc_bytealign_S (w1[0], w1[1], offset); + w6[0] = hc_bytealign_S (w0[3], w1[0], offset); + w5[3] = hc_bytealign_S (w0[2], w0[3], offset); + w5[2] = hc_bytealign_S (w0[1], w0[2], offset); + w5[1] = hc_bytealign_S (w0[0], w0[1], offset); + w5[0] = hc_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + c5[1] = hc_bytealign_S (w7[3], 0, offset); + c5[0] = hc_bytealign_S (w7[2], w7[3], offset); + c4[3] = hc_bytealign_S (w7[1], w7[2], offset); + c4[2] = hc_bytealign_S (w7[0], w7[1], offset); + c4[1] = hc_bytealign_S (w6[3], w7[0], offset); + c4[0] = hc_bytealign_S (w6[2], w6[3], offset); + c3[3] = hc_bytealign_S (w6[1], w6[2], offset); + c3[2] = hc_bytealign_S (w6[0], w6[1], offset); + c3[1] = hc_bytealign_S (w5[3], w6[0], offset); + c3[0] = hc_bytealign_S (w5[2], w5[3], offset); + c2[3] = hc_bytealign_S (w5[1], w5[2], offset); + c2[2] = hc_bytealign_S (w5[0], w5[1], offset); + c2[1] = hc_bytealign_S (w4[3], w5[0], offset); + c2[0] = hc_bytealign_S (w4[2], w4[3], offset); + c1[3] = hc_bytealign_S (w4[1], w4[2], offset); + c1[2] = hc_bytealign_S (w4[0], w4[1], offset); + c1[1] = hc_bytealign_S (w3[3], w4[0], offset); + c1[0] = hc_bytealign_S (w3[2], w3[3], offset); + c0[3] = hc_bytealign_S (w3[1], w3[2], offset); + c0[2] = hc_bytealign_S (w3[0], w3[1], offset); + c0[1] = hc_bytealign_S (w2[3], w3[0], offset); + c0[0] = hc_bytealign_S (w2[2], w2[3], offset); + w7[3] = hc_bytealign_S (w2[1], w2[2], offset); + w7[2] = hc_bytealign_S (w2[0], w2[1], offset); + w7[1] = hc_bytealign_S (w1[3], w2[0], offset); + w7[0] = hc_bytealign_S (w1[2], w1[3], offset); + w6[3] = hc_bytealign_S (w1[1], w1[2], offset); + w6[2] = hc_bytealign_S (w1[0], w1[1], offset); + w6[1] = hc_bytealign_S (w0[3], w1[0], offset); + w6[0] = hc_bytealign_S (w0[2], w0[3], offset); + w5[3] = hc_bytealign_S (w0[1], w0[2], offset); + w5[2] = hc_bytealign_S (w0[0], w0[1], offset); + w5[1] = hc_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + c5[2] = hc_bytealign_S (w7[3], 0, offset); + c5[1] = hc_bytealign_S (w7[2], w7[3], offset); + c5[0] = hc_bytealign_S (w7[1], w7[2], offset); + c4[3] = hc_bytealign_S (w7[0], w7[1], offset); + c4[2] = hc_bytealign_S (w6[3], w7[0], offset); + c4[1] = hc_bytealign_S (w6[2], w6[3], offset); + c4[0] = hc_bytealign_S (w6[1], w6[2], offset); + c3[3] = hc_bytealign_S (w6[0], w6[1], offset); + c3[2] = hc_bytealign_S (w5[3], w6[0], offset); + c3[1] = hc_bytealign_S (w5[2], w5[3], offset); + c3[0] = hc_bytealign_S (w5[1], w5[2], offset); + c2[3] = hc_bytealign_S (w5[0], w5[1], offset); + c2[2] = hc_bytealign_S (w4[3], w5[0], offset); + c2[1] = hc_bytealign_S (w4[2], w4[3], offset); + c2[0] = hc_bytealign_S (w4[1], w4[2], offset); + c1[3] = hc_bytealign_S (w4[0], w4[1], offset); + c1[2] = hc_bytealign_S (w3[3], w4[0], offset); + c1[1] = hc_bytealign_S (w3[2], w3[3], offset); + c1[0] = hc_bytealign_S (w3[1], w3[2], offset); + c0[3] = hc_bytealign_S (w3[0], w3[1], offset); + c0[2] = hc_bytealign_S (w2[3], w3[0], offset); + c0[1] = hc_bytealign_S (w2[2], w2[3], offset); + c0[0] = hc_bytealign_S (w2[1], w2[2], offset); + w7[3] = hc_bytealign_S (w2[0], w2[1], offset); + w7[2] = hc_bytealign_S (w1[3], w2[0], offset); + w7[1] = hc_bytealign_S (w1[2], w1[3], offset); + w7[0] = hc_bytealign_S (w1[1], w1[2], offset); + w6[3] = hc_bytealign_S (w1[0], w1[1], offset); + w6[2] = hc_bytealign_S (w0[3], w1[0], offset); + w6[1] = hc_bytealign_S (w0[2], w0[3], offset); + w6[0] = hc_bytealign_S (w0[1], w0[2], offset); + w5[3] = hc_bytealign_S (w0[0], w0[1], offset); + w5[2] = hc_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + c5[3] = hc_bytealign_S (w7[3], 0, offset); + c5[2] = hc_bytealign_S (w7[2], w7[3], offset); + c5[1] = hc_bytealign_S (w7[1], w7[2], offset); + c5[0] = hc_bytealign_S (w7[0], w7[1], offset); + c4[3] = hc_bytealign_S (w6[3], w7[0], offset); + c4[2] = hc_bytealign_S (w6[2], w6[3], offset); + c4[1] = hc_bytealign_S (w6[1], w6[2], offset); + c4[0] = hc_bytealign_S (w6[0], w6[1], offset); + c3[3] = hc_bytealign_S (w5[3], w6[0], offset); + c3[2] = hc_bytealign_S (w5[2], w5[3], offset); + c3[1] = hc_bytealign_S (w5[1], w5[2], offset); + c3[0] = hc_bytealign_S (w5[0], w5[1], offset); + c2[3] = hc_bytealign_S (w4[3], w5[0], offset); + c2[2] = hc_bytealign_S (w4[2], w4[3], offset); + c2[1] = hc_bytealign_S (w4[1], w4[2], offset); + c2[0] = hc_bytealign_S (w4[0], w4[1], offset); + c1[3] = hc_bytealign_S (w3[3], w4[0], offset); + c1[2] = hc_bytealign_S (w3[2], w3[3], offset); + c1[1] = hc_bytealign_S (w3[1], w3[2], offset); + c1[0] = hc_bytealign_S (w3[0], w3[1], offset); + c0[3] = hc_bytealign_S (w2[3], w3[0], offset); + c0[2] = hc_bytealign_S (w2[2], w2[3], offset); + c0[1] = hc_bytealign_S (w2[1], w2[2], offset); + c0[0] = hc_bytealign_S (w2[0], w2[1], offset); + w7[3] = hc_bytealign_S (w1[3], w2[0], offset); + w7[2] = hc_bytealign_S (w1[2], w1[3], offset); + w7[1] = hc_bytealign_S (w1[1], w1[2], offset); + w7[0] = hc_bytealign_S (w1[0], w1[1], offset); + w6[3] = hc_bytealign_S (w0[3], w1[0], offset); + w6[2] = hc_bytealign_S (w0[2], w0[3], offset); + w6[1] = hc_bytealign_S (w0[1], w0[2], offset); + w6[0] = hc_bytealign_S (w0[0], w0[1], offset); + w5[3] = hc_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_bytealign_S (w7[3], 0, offset); + c5[3] = hc_bytealign_S (w7[2], w7[3], offset); + c5[2] = hc_bytealign_S (w7[1], w7[2], offset); + c5[1] = hc_bytealign_S (w7[0], w7[1], offset); + c5[0] = hc_bytealign_S (w6[3], w7[0], offset); + c4[3] = hc_bytealign_S (w6[2], w6[3], offset); + c4[2] = hc_bytealign_S (w6[1], w6[2], offset); + c4[1] = hc_bytealign_S (w6[0], w6[1], offset); + c4[0] = hc_bytealign_S (w5[3], w6[0], offset); + c3[3] = hc_bytealign_S (w5[2], w5[3], offset); + c3[2] = hc_bytealign_S (w5[1], w5[2], offset); + c3[1] = hc_bytealign_S (w5[0], w5[1], offset); + c3[0] = hc_bytealign_S (w4[3], w5[0], offset); + c2[3] = hc_bytealign_S (w4[2], w4[3], offset); + c2[2] = hc_bytealign_S (w4[1], w4[2], offset); + c2[1] = hc_bytealign_S (w4[0], w4[1], offset); + c2[0] = hc_bytealign_S (w3[3], w4[0], offset); + c1[3] = hc_bytealign_S (w3[2], w3[3], offset); + c1[2] = hc_bytealign_S (w3[1], w3[2], offset); + c1[1] = hc_bytealign_S (w3[0], w3[1], offset); + c1[0] = hc_bytealign_S (w2[3], w3[0], offset); + c0[3] = hc_bytealign_S (w2[2], w2[3], offset); + c0[2] = hc_bytealign_S (w2[1], w2[2], offset); + c0[1] = hc_bytealign_S (w2[0], w2[1], offset); + c0[0] = hc_bytealign_S (w1[3], w2[0], offset); + w7[3] = hc_bytealign_S (w1[2], w1[3], offset); + w7[2] = hc_bytealign_S (w1[1], w1[2], offset); + w7[1] = hc_bytealign_S (w1[0], w1[1], offset); + w7[0] = hc_bytealign_S (w0[3], w1[0], offset); + w6[3] = hc_bytealign_S (w0[2], w0[3], offset); + w6[2] = hc_bytealign_S (w0[1], w0[2], offset); + w6[1] = hc_bytealign_S (w0[0], w0[1], offset); + w6[0] = hc_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_bytealign_S (w7[3], 0, offset); + c6[0] = hc_bytealign_S (w7[2], w7[3], offset); + c5[3] = hc_bytealign_S (w7[1], w7[2], offset); + c5[2] = hc_bytealign_S (w7[0], w7[1], offset); + c5[1] = hc_bytealign_S (w6[3], w7[0], offset); + c5[0] = hc_bytealign_S (w6[2], w6[3], offset); + c4[3] = hc_bytealign_S (w6[1], w6[2], offset); + c4[2] = hc_bytealign_S (w6[0], w6[1], offset); + c4[1] = hc_bytealign_S (w5[3], w6[0], offset); + c4[0] = hc_bytealign_S (w5[2], w5[3], offset); + c3[3] = hc_bytealign_S (w5[1], w5[2], offset); + c3[2] = hc_bytealign_S (w5[0], w5[1], offset); + c3[1] = hc_bytealign_S (w4[3], w5[0], offset); + c3[0] = hc_bytealign_S (w4[2], w4[3], offset); + c2[3] = hc_bytealign_S (w4[1], w4[2], offset); + c2[2] = hc_bytealign_S (w4[0], w4[1], offset); + c2[1] = hc_bytealign_S (w3[3], w4[0], offset); + c2[0] = hc_bytealign_S (w3[2], w3[3], offset); + c1[3] = hc_bytealign_S (w3[1], w3[2], offset); + c1[2] = hc_bytealign_S (w3[0], w3[1], offset); + c1[1] = hc_bytealign_S (w2[3], w3[0], offset); + c1[0] = hc_bytealign_S (w2[2], w2[3], offset); + c0[3] = hc_bytealign_S (w2[1], w2[2], offset); + c0[2] = hc_bytealign_S (w2[0], w2[1], offset); + c0[1] = hc_bytealign_S (w1[3], w2[0], offset); + c0[0] = hc_bytealign_S (w1[2], w1[3], offset); + w7[3] = hc_bytealign_S (w1[1], w1[2], offset); + w7[2] = hc_bytealign_S (w1[0], w1[1], offset); + w7[1] = hc_bytealign_S (w0[3], w1[0], offset); + w7[0] = hc_bytealign_S (w0[2], w0[3], offset); + w6[3] = hc_bytealign_S (w0[1], w0[2], offset); + w6[2] = hc_bytealign_S (w0[0], w0[1], offset); + w6[1] = hc_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + c6[2] = hc_bytealign_S (w7[3], 0, offset); + c6[1] = hc_bytealign_S (w7[2], w7[3], offset); + c6[0] = hc_bytealign_S (w7[1], w7[2], offset); + c5[3] = hc_bytealign_S (w7[0], w7[1], offset); + c5[2] = hc_bytealign_S (w6[3], w7[0], offset); + c5[1] = hc_bytealign_S (w6[2], w6[3], offset); + c5[0] = hc_bytealign_S (w6[1], w6[2], offset); + c4[3] = hc_bytealign_S (w6[0], w6[1], offset); + c4[2] = hc_bytealign_S (w5[3], w6[0], offset); + c4[1] = hc_bytealign_S (w5[2], w5[3], offset); + c4[0] = hc_bytealign_S (w5[1], w5[2], offset); + c3[3] = hc_bytealign_S (w5[0], w5[1], offset); + c3[2] = hc_bytealign_S (w4[3], w5[0], offset); + c3[1] = hc_bytealign_S (w4[2], w4[3], offset); + c3[0] = hc_bytealign_S (w4[1], w4[2], offset); + c2[3] = hc_bytealign_S (w4[0], w4[1], offset); + c2[2] = hc_bytealign_S (w3[3], w4[0], offset); + c2[1] = hc_bytealign_S (w3[2], w3[3], offset); + c2[0] = hc_bytealign_S (w3[1], w3[2], offset); + c1[3] = hc_bytealign_S (w3[0], w3[1], offset); + c1[2] = hc_bytealign_S (w2[3], w3[0], offset); + c1[1] = hc_bytealign_S (w2[2], w2[3], offset); + c1[0] = hc_bytealign_S (w2[1], w2[2], offset); + c0[3] = hc_bytealign_S (w2[0], w2[1], offset); + c0[2] = hc_bytealign_S (w1[3], w2[0], offset); + c0[1] = hc_bytealign_S (w1[2], w1[3], offset); + c0[0] = hc_bytealign_S (w1[1], w1[2], offset); + w7[3] = hc_bytealign_S (w1[0], w1[1], offset); + w7[2] = hc_bytealign_S (w0[3], w1[0], offset); + w7[1] = hc_bytealign_S (w0[2], w0[3], offset); + w7[0] = hc_bytealign_S (w0[1], w0[2], offset); + w6[3] = hc_bytealign_S (w0[0], w0[1], offset); + w6[2] = hc_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + c6[3] = hc_bytealign_S (w7[3], 0, offset); + c6[2] = hc_bytealign_S (w7[2], w7[3], offset); + c6[1] = hc_bytealign_S (w7[1], w7[2], offset); + c6[0] = hc_bytealign_S (w7[0], w7[1], offset); + c5[3] = hc_bytealign_S (w6[3], w7[0], offset); + c5[2] = hc_bytealign_S (w6[2], w6[3], offset); + c5[1] = hc_bytealign_S (w6[1], w6[2], offset); + c5[0] = hc_bytealign_S (w6[0], w6[1], offset); + c4[3] = hc_bytealign_S (w5[3], w6[0], offset); + c4[2] = hc_bytealign_S (w5[2], w5[3], offset); + c4[1] = hc_bytealign_S (w5[1], w5[2], offset); + c4[0] = hc_bytealign_S (w5[0], w5[1], offset); + c3[3] = hc_bytealign_S (w4[3], w5[0], offset); + c3[2] = hc_bytealign_S (w4[2], w4[3], offset); + c3[1] = hc_bytealign_S (w4[1], w4[2], offset); + c3[0] = hc_bytealign_S (w4[0], w4[1], offset); + c2[3] = hc_bytealign_S (w3[3], w4[0], offset); + c2[2] = hc_bytealign_S (w3[2], w3[3], offset); + c2[1] = hc_bytealign_S (w3[1], w3[2], offset); + c2[0] = hc_bytealign_S (w3[0], w3[1], offset); + c1[3] = hc_bytealign_S (w2[3], w3[0], offset); + c1[2] = hc_bytealign_S (w2[2], w2[3], offset); + c1[1] = hc_bytealign_S (w2[1], w2[2], offset); + c1[0] = hc_bytealign_S (w2[0], w2[1], offset); + c0[3] = hc_bytealign_S (w1[3], w2[0], offset); + c0[2] = hc_bytealign_S (w1[2], w1[3], offset); + c0[1] = hc_bytealign_S (w1[1], w1[2], offset); + c0[0] = hc_bytealign_S (w1[0], w1[1], offset); + w7[3] = hc_bytealign_S (w0[3], w1[0], offset); + w7[2] = hc_bytealign_S (w0[2], w0[3], offset); + w7[1] = hc_bytealign_S (w0[1], w0[2], offset); + w7[0] = hc_bytealign_S (w0[0], w0[1], offset); + w6[3] = hc_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + c7[0] = hc_bytealign_S (w7[3], 0, offset); + c6[3] = hc_bytealign_S (w7[2], w7[3], offset); + c6[2] = hc_bytealign_S (w7[1], w7[2], offset); + c6[1] = hc_bytealign_S (w7[0], w7[1], offset); + c6[0] = hc_bytealign_S (w6[3], w7[0], offset); + c5[3] = hc_bytealign_S (w6[2], w6[3], offset); + c5[2] = hc_bytealign_S (w6[1], w6[2], offset); + c5[1] = hc_bytealign_S (w6[0], w6[1], offset); + c5[0] = hc_bytealign_S (w5[3], w6[0], offset); + c4[3] = hc_bytealign_S (w5[2], w5[3], offset); + c4[2] = hc_bytealign_S (w5[1], w5[2], offset); + c4[1] = hc_bytealign_S (w5[0], w5[1], offset); + c4[0] = hc_bytealign_S (w4[3], w5[0], offset); + c3[3] = hc_bytealign_S (w4[2], w4[3], offset); + c3[2] = hc_bytealign_S (w4[1], w4[2], offset); + c3[1] = hc_bytealign_S (w4[0], w4[1], offset); + c3[0] = hc_bytealign_S (w3[3], w4[0], offset); + c2[3] = hc_bytealign_S (w3[2], w3[3], offset); + c2[2] = hc_bytealign_S (w3[1], w3[2], offset); + c2[1] = hc_bytealign_S (w3[0], w3[1], offset); + c2[0] = hc_bytealign_S (w2[3], w3[0], offset); + c1[3] = hc_bytealign_S (w2[2], w2[3], offset); + c1[2] = hc_bytealign_S (w2[1], w2[2], offset); + c1[1] = hc_bytealign_S (w2[0], w2[1], offset); + c1[0] = hc_bytealign_S (w1[3], w2[0], offset); + c0[3] = hc_bytealign_S (w1[2], w1[3], offset); + c0[2] = hc_bytealign_S (w1[1], w1[2], offset); + c0[1] = hc_bytealign_S (w1[0], w1[1], offset); + c0[0] = hc_bytealign_S (w0[3], w1[0], offset); + w7[3] = hc_bytealign_S (w0[2], w0[3], offset); + w7[2] = hc_bytealign_S (w0[1], w0[2], offset); + w7[1] = hc_bytealign_S (w0[0], w0[1], offset); + w7[0] = hc_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + c7[1] = hc_bytealign_S (w7[3], 0, offset); + c7[0] = hc_bytealign_S (w7[2], w7[3], offset); + c6[3] = hc_bytealign_S (w7[1], w7[2], offset); + c6[2] = hc_bytealign_S (w7[0], w7[1], offset); + c6[1] = hc_bytealign_S (w6[3], w7[0], offset); + c6[0] = hc_bytealign_S (w6[2], w6[3], offset); + c5[3] = hc_bytealign_S (w6[1], w6[2], offset); + c5[2] = hc_bytealign_S (w6[0], w6[1], offset); + c5[1] = hc_bytealign_S (w5[3], w6[0], offset); + c5[0] = hc_bytealign_S (w5[2], w5[3], offset); + c4[3] = hc_bytealign_S (w5[1], w5[2], offset); + c4[2] = hc_bytealign_S (w5[0], w5[1], offset); + c4[1] = hc_bytealign_S (w4[3], w5[0], offset); + c4[0] = hc_bytealign_S (w4[2], w4[3], offset); + c3[3] = hc_bytealign_S (w4[1], w4[2], offset); + c3[2] = hc_bytealign_S (w4[0], w4[1], offset); + c3[1] = hc_bytealign_S (w3[3], w4[0], offset); + c3[0] = hc_bytealign_S (w3[2], w3[3], offset); + c2[3] = hc_bytealign_S (w3[1], w3[2], offset); + c2[2] = hc_bytealign_S (w3[0], w3[1], offset); + c2[1] = hc_bytealign_S (w2[3], w3[0], offset); + c2[0] = hc_bytealign_S (w2[2], w2[3], offset); + c1[3] = hc_bytealign_S (w2[1], w2[2], offset); + c1[2] = hc_bytealign_S (w2[0], w2[1], offset); + c1[1] = hc_bytealign_S (w1[3], w2[0], offset); + c1[0] = hc_bytealign_S (w1[2], w1[3], offset); + c0[3] = hc_bytealign_S (w1[1], w1[2], offset); + c0[2] = hc_bytealign_S (w1[0], w1[1], offset); + c0[1] = hc_bytealign_S (w0[3], w1[0], offset); + c0[0] = hc_bytealign_S (w0[2], w0[3], offset); + w7[3] = hc_bytealign_S (w0[1], w0[2], offset); + w7[2] = hc_bytealign_S (w0[0], w0[1], offset); + w7[1] = hc_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + c7[2] = hc_bytealign_S (w7[3], 0, offset); + c7[1] = hc_bytealign_S (w7[2], w7[3], offset); + c7[0] = hc_bytealign_S (w7[1], w7[2], offset); + c6[3] = hc_bytealign_S (w7[0], w7[1], offset); + c6[2] = hc_bytealign_S (w6[3], w7[0], offset); + c6[1] = hc_bytealign_S (w6[2], w6[3], offset); + c6[0] = hc_bytealign_S (w6[1], w6[2], offset); + c5[3] = hc_bytealign_S (w6[0], w6[1], offset); + c5[2] = hc_bytealign_S (w5[3], w6[0], offset); + c5[1] = hc_bytealign_S (w5[2], w5[3], offset); + c5[0] = hc_bytealign_S (w5[1], w5[2], offset); + c4[3] = hc_bytealign_S (w5[0], w5[1], offset); + c4[2] = hc_bytealign_S (w4[3], w5[0], offset); + c4[1] = hc_bytealign_S (w4[2], w4[3], offset); + c4[0] = hc_bytealign_S (w4[1], w4[2], offset); + c3[3] = hc_bytealign_S (w4[0], w4[1], offset); + c3[2] = hc_bytealign_S (w3[3], w4[0], offset); + c3[1] = hc_bytealign_S (w3[2], w3[3], offset); + c3[0] = hc_bytealign_S (w3[1], w3[2], offset); + c2[3] = hc_bytealign_S (w3[0], w3[1], offset); + c2[2] = hc_bytealign_S (w2[3], w3[0], offset); + c2[1] = hc_bytealign_S (w2[2], w2[3], offset); + c2[0] = hc_bytealign_S (w2[1], w2[2], offset); + c1[3] = hc_bytealign_S (w2[0], w2[1], offset); + c1[2] = hc_bytealign_S (w1[3], w2[0], offset); + c1[1] = hc_bytealign_S (w1[2], w1[3], offset); + c1[0] = hc_bytealign_S (w1[1], w1[2], offset); + c0[3] = hc_bytealign_S (w1[0], w1[1], offset); + c0[2] = hc_bytealign_S (w0[3], w1[0], offset); + c0[1] = hc_bytealign_S (w0[2], w0[3], offset); + c0[0] = hc_bytealign_S (w0[1], w0[2], offset); + w7[3] = hc_bytealign_S (w0[0], w0[1], offset); + w7[2] = hc_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + c7[3] = hc_bytealign_S (w7[3], 0, offset); + c7[2] = hc_bytealign_S (w7[2], w7[3], offset); + c7[1] = hc_bytealign_S (w7[1], w7[2], offset); + c7[0] = hc_bytealign_S (w7[0], w7[1], offset); + c6[3] = hc_bytealign_S (w6[3], w7[0], offset); + c6[2] = hc_bytealign_S (w6[2], w6[3], offset); + c6[1] = hc_bytealign_S (w6[1], w6[2], offset); + c6[0] = hc_bytealign_S (w6[0], w6[1], offset); + c5[3] = hc_bytealign_S (w5[3], w6[0], offset); + c5[2] = hc_bytealign_S (w5[2], w5[3], offset); + c5[1] = hc_bytealign_S (w5[1], w5[2], offset); + c5[0] = hc_bytealign_S (w5[0], w5[1], offset); + c4[3] = hc_bytealign_S (w4[3], w5[0], offset); + c4[2] = hc_bytealign_S (w4[2], w4[3], offset); + c4[1] = hc_bytealign_S (w4[1], w4[2], offset); + c4[0] = hc_bytealign_S (w4[0], w4[1], offset); + c3[3] = hc_bytealign_S (w3[3], w4[0], offset); + c3[2] = hc_bytealign_S (w3[2], w3[3], offset); + c3[1] = hc_bytealign_S (w3[1], w3[2], offset); + c3[0] = hc_bytealign_S (w3[0], w3[1], offset); + c2[3] = hc_bytealign_S (w2[3], w3[0], offset); + c2[2] = hc_bytealign_S (w2[2], w2[3], offset); + c2[1] = hc_bytealign_S (w2[1], w2[2], offset); + c2[0] = hc_bytealign_S (w2[0], w2[1], offset); + c1[3] = hc_bytealign_S (w1[3], w2[0], offset); + c1[2] = hc_bytealign_S (w1[2], w1[3], offset); + c1[1] = hc_bytealign_S (w1[1], w1[2], offset); + c1[0] = hc_bytealign_S (w1[0], w1[1], offset); + c0[3] = hc_bytealign_S (w0[3], w1[0], offset); + c0[2] = hc_bytealign_S (w0[2], w0[3], offset); + c0[1] = hc_bytealign_S (w0[1], w0[2], offset); + c0[0] = hc_bytealign_S (w0[0], w0[1], offset); + w7[3] = hc_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif + + #if (defined IS_AMD && HAS_VPERM == 1) || defined IS_NV + + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if defined IS_AMD + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); + #endif + + switch (offset_switch) + { + case 0: + c0[0] = hc_byte_perm_S (w7[3], 0, selector); + w7[3] = hc_byte_perm_S (w7[2], w7[3], selector); + w7[2] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[1] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[0] = hc_byte_perm_S (w6[3], w7[0], selector); + w6[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w5[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w4[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w3[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w2[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w1[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w0[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[0] = hc_byte_perm_S ( 0, w0[0], selector); + + break; + + case 1: + c0[1] = hc_byte_perm_S (w7[3], 0, selector); + c0[0] = hc_byte_perm_S (w7[2], w7[3], selector); + w7[3] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[2] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[1] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[0] = hc_byte_perm_S (w6[2], w6[3], selector); + w6[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w5[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w4[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w3[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w2[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w1[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w0[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[1] = hc_byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; + + break; + + case 2: + c0[2] = hc_byte_perm_S (w7[3], 0, selector); + c0[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[0] = hc_byte_perm_S (w7[1], w7[2], selector); + w7[3] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[2] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[1] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[0] = hc_byte_perm_S (w6[1], w6[2], selector); + w6[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w5[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w4[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w3[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w2[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w1[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w0[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[2] = hc_byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = hc_byte_perm_S (w7[3], 0, selector); + c0[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[0] = hc_byte_perm_S (w7[0], w7[1], selector); + w7[3] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[2] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[1] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[0] = hc_byte_perm_S (w6[0], w6[1], selector); + w6[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w5[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w4[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w3[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w2[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w1[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w0[3] = hc_byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = hc_byte_perm_S (w7[3], 0, selector); + c0[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[0] = hc_byte_perm_S (w6[3], w7[0], selector); + w7[3] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[2] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[1] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w6[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w5[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w4[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w3[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w2[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w1[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[0] = hc_byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = hc_byte_perm_S (w7[3], 0, selector); + c1[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c0[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[0] = hc_byte_perm_S (w6[2], w6[3], selector); + w7[3] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[2] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[1] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w6[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w5[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w4[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w3[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w2[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w1[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[1] = hc_byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = hc_byte_perm_S (w7[3], 0, selector); + c1[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c0[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[0] = hc_byte_perm_S (w6[1], w6[2], selector); + w7[3] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[2] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[1] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w6[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w5[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w4[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w3[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w2[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w1[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[2] = hc_byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = hc_byte_perm_S (w7[3], 0, selector); + c1[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c0[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[0] = hc_byte_perm_S (w6[0], w6[1], selector); + w7[3] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[2] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[1] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w6[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w5[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w4[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w3[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w2[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w1[3] = hc_byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = hc_byte_perm_S (w7[3], 0, selector); + c1[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c0[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[0] = hc_byte_perm_S (w5[3], w6[0], selector); + w7[3] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[2] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[1] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w6[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w5[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w4[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w3[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w2[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[0] = hc_byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = hc_byte_perm_S (w7[3], 0, selector); + c2[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c1[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c0[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[0] = hc_byte_perm_S (w5[2], w5[3], selector); + w7[3] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[2] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[1] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w6[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w5[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w4[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w3[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w2[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[1] = hc_byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = hc_byte_perm_S (w7[3], 0, selector); + c2[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c1[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c0[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[0] = hc_byte_perm_S (w5[1], w5[2], selector); + w7[3] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[2] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[1] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w6[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w5[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w4[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w3[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w2[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[2] = hc_byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = hc_byte_perm_S (w7[3], 0, selector); + c2[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c1[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c0[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[0] = hc_byte_perm_S (w5[0], w5[1], selector); + w7[3] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[2] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[1] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w6[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w5[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w4[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w3[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w2[3] = hc_byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = hc_byte_perm_S (w7[3], 0, selector); + c2[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c1[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c0[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[0] = hc_byte_perm_S (w4[3], w5[0], selector); + w7[3] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[2] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[1] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w6[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w5[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w4[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w3[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[0] = hc_byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = hc_byte_perm_S (w7[3], 0, selector); + c3[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c2[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c1[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c0[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[0] = hc_byte_perm_S (w4[2], w4[3], selector); + w7[3] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[2] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[1] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w6[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w5[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w4[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w3[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[1] = hc_byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = hc_byte_perm_S (w7[3], 0, selector); + c3[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c2[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c1[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c0[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[0] = hc_byte_perm_S (w4[1], w4[2], selector); + w7[3] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[2] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[1] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w6[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w5[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w4[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w3[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[2] = hc_byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = hc_byte_perm_S (w7[3], 0, selector); + c3[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c2[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c1[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c0[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[0] = hc_byte_perm_S (w4[0], w4[1], selector); + w7[3] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[2] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[1] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w6[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w5[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w4[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w3[3] = hc_byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 16: + c4[0] = hc_byte_perm_S (w7[3], 0, selector); + c3[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c2[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c1[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c0[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[0] = hc_byte_perm_S (w3[3], w4[0], selector); + w7[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[0] = hc_byte_perm_S ( 0, w0[0], selector); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + c4[1] = hc_byte_perm_S (w7[3], 0, selector); + c4[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c3[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c2[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c1[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c0[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[0] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[1] = hc_byte_perm_S ( 0, w0[0], selector); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + c4[2] = hc_byte_perm_S (w7[3], 0, selector); + c4[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c3[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c2[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c1[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c0[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[0] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[2] = hc_byte_perm_S ( 0, w0[0], selector); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + c4[3] = hc_byte_perm_S (w7[3], 0, selector); + c4[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c3[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c2[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c1[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c0[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[0] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[3] = hc_byte_perm_S ( 0, w0[0], selector); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + c5[0] = hc_byte_perm_S (w7[3], 0, selector); + c4[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c3[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c2[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c1[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[0] = hc_byte_perm_S (w3[3], w4[0], selector); + c0[3] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[2] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[1] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[0] = hc_byte_perm_S ( 0, w0[0], selector); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + c5[1] = hc_byte_perm_S (w7[3], 0, selector); + c5[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c4[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c3[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c2[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c1[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[0] = hc_byte_perm_S (w3[2], w3[3], selector); + c0[3] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[2] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[1] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[1] = hc_byte_perm_S ( 0, w0[0], selector); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + c5[2] = hc_byte_perm_S (w7[3], 0, selector); + c5[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c4[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c3[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c2[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c1[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[0] = hc_byte_perm_S (w3[1], w3[2], selector); + c0[3] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[2] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[1] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[2] = hc_byte_perm_S ( 0, w0[0], selector); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + c5[3] = hc_byte_perm_S (w7[3], 0, selector); + c5[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c4[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c3[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c2[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c1[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[0] = hc_byte_perm_S (w3[0], w3[1], selector); + c0[3] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[2] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[1] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[3] = hc_byte_perm_S ( 0, w0[0], selector); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + c6[0] = hc_byte_perm_S (w7[3], 0, selector); + c5[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c4[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c3[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c2[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[0] = hc_byte_perm_S (w3[3], w4[0], selector); + c1[3] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[2] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[1] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[0] = hc_byte_perm_S (w2[3], w3[0], selector); + c0[3] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[2] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[1] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[0] = hc_byte_perm_S ( 0, w0[0], selector); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + c6[1] = hc_byte_perm_S (w7[3], 0, selector); + c6[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c5[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c4[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c3[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c2[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[0] = hc_byte_perm_S (w3[2], w3[3], selector); + c1[3] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[2] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[1] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[0] = hc_byte_perm_S (w2[2], w2[3], selector); + c0[3] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[2] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[1] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[1] = hc_byte_perm_S ( 0, w0[0], selector); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + c6[2] = hc_byte_perm_S (w7[3], 0, selector); + c6[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c5[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c4[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c3[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c2[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[0] = hc_byte_perm_S (w3[1], w3[2], selector); + c1[3] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[2] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[1] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[0] = hc_byte_perm_S (w2[1], w2[2], selector); + c0[3] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[2] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[1] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[2] = hc_byte_perm_S ( 0, w0[0], selector); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + c6[3] = hc_byte_perm_S (w7[3], 0, selector); + c6[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c5[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c4[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c3[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c2[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[0] = hc_byte_perm_S (w3[0], w3[1], selector); + c1[3] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[2] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[1] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[0] = hc_byte_perm_S (w2[0], w2[1], selector); + c0[3] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[2] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[1] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[3] = hc_byte_perm_S ( 0, w0[0], selector); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + c7[0] = hc_byte_perm_S (w7[3], 0, selector); + c6[3] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[2] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[1] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[0] = hc_byte_perm_S (w6[3], w7[0], selector); + c5[3] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[2] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[1] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[0] = hc_byte_perm_S (w5[3], w6[0], selector); + c4[3] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[2] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[1] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[0] = hc_byte_perm_S (w4[3], w5[0], selector); + c3[3] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[2] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[1] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[0] = hc_byte_perm_S (w3[3], w4[0], selector); + c2[3] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[2] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[1] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[0] = hc_byte_perm_S (w2[3], w3[0], selector); + c1[3] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[2] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[1] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[0] = hc_byte_perm_S (w1[3], w2[0], selector); + c0[3] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[2] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[1] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[0] = hc_byte_perm_S ( 0, w0[0], selector); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + c7[1] = hc_byte_perm_S (w7[3], 0, selector); + c7[0] = hc_byte_perm_S (w7[2], w7[3], selector); + c6[3] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[2] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[1] = hc_byte_perm_S (w6[3], w7[0], selector); + c6[0] = hc_byte_perm_S (w6[2], w6[3], selector); + c5[3] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[2] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[1] = hc_byte_perm_S (w5[3], w6[0], selector); + c5[0] = hc_byte_perm_S (w5[2], w5[3], selector); + c4[3] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[2] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[1] = hc_byte_perm_S (w4[3], w5[0], selector); + c4[0] = hc_byte_perm_S (w4[2], w4[3], selector); + c3[3] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[2] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[1] = hc_byte_perm_S (w3[3], w4[0], selector); + c3[0] = hc_byte_perm_S (w3[2], w3[3], selector); + c2[3] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[2] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[1] = hc_byte_perm_S (w2[3], w3[0], selector); + c2[0] = hc_byte_perm_S (w2[2], w2[3], selector); + c1[3] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[2] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[1] = hc_byte_perm_S (w1[3], w2[0], selector); + c1[0] = hc_byte_perm_S (w1[2], w1[3], selector); + c0[3] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[2] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[1] = hc_byte_perm_S (w0[3], w1[0], selector); + c0[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[1] = hc_byte_perm_S ( 0, w0[0], selector); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + c7[2] = hc_byte_perm_S (w7[3], 0, selector); + c7[1] = hc_byte_perm_S (w7[2], w7[3], selector); + c7[0] = hc_byte_perm_S (w7[1], w7[2], selector); + c6[3] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[2] = hc_byte_perm_S (w6[3], w7[0], selector); + c6[1] = hc_byte_perm_S (w6[2], w6[3], selector); + c6[0] = hc_byte_perm_S (w6[1], w6[2], selector); + c5[3] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[2] = hc_byte_perm_S (w5[3], w6[0], selector); + c5[1] = hc_byte_perm_S (w5[2], w5[3], selector); + c5[0] = hc_byte_perm_S (w5[1], w5[2], selector); + c4[3] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[2] = hc_byte_perm_S (w4[3], w5[0], selector); + c4[1] = hc_byte_perm_S (w4[2], w4[3], selector); + c4[0] = hc_byte_perm_S (w4[1], w4[2], selector); + c3[3] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[2] = hc_byte_perm_S (w3[3], w4[0], selector); + c3[1] = hc_byte_perm_S (w3[2], w3[3], selector); + c3[0] = hc_byte_perm_S (w3[1], w3[2], selector); + c2[3] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[2] = hc_byte_perm_S (w2[3], w3[0], selector); + c2[1] = hc_byte_perm_S (w2[2], w2[3], selector); + c2[0] = hc_byte_perm_S (w2[1], w2[2], selector); + c1[3] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[2] = hc_byte_perm_S (w1[3], w2[0], selector); + c1[1] = hc_byte_perm_S (w1[2], w1[3], selector); + c1[0] = hc_byte_perm_S (w1[1], w1[2], selector); + c0[3] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[2] = hc_byte_perm_S (w0[3], w1[0], selector); + c0[1] = hc_byte_perm_S (w0[2], w0[3], selector); + c0[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[2] = hc_byte_perm_S ( 0, w0[0], selector); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + c7[3] = hc_byte_perm_S (w7[3], 0, selector); + c7[2] = hc_byte_perm_S (w7[2], w7[3], selector); + c7[1] = hc_byte_perm_S (w7[1], w7[2], selector); + c7[0] = hc_byte_perm_S (w7[0], w7[1], selector); + c6[3] = hc_byte_perm_S (w6[3], w7[0], selector); + c6[2] = hc_byte_perm_S (w6[2], w6[3], selector); + c6[1] = hc_byte_perm_S (w6[1], w6[2], selector); + c6[0] = hc_byte_perm_S (w6[0], w6[1], selector); + c5[3] = hc_byte_perm_S (w5[3], w6[0], selector); + c5[2] = hc_byte_perm_S (w5[2], w5[3], selector); + c5[1] = hc_byte_perm_S (w5[1], w5[2], selector); + c5[0] = hc_byte_perm_S (w5[0], w5[1], selector); + c4[3] = hc_byte_perm_S (w4[3], w5[0], selector); + c4[2] = hc_byte_perm_S (w4[2], w4[3], selector); + c4[1] = hc_byte_perm_S (w4[1], w4[2], selector); + c4[0] = hc_byte_perm_S (w4[0], w4[1], selector); + c3[3] = hc_byte_perm_S (w3[3], w4[0], selector); + c3[2] = hc_byte_perm_S (w3[2], w3[3], selector); + c3[1] = hc_byte_perm_S (w3[1], w3[2], selector); + c3[0] = hc_byte_perm_S (w3[0], w3[1], selector); + c2[3] = hc_byte_perm_S (w2[3], w3[0], selector); + c2[2] = hc_byte_perm_S (w2[2], w2[3], selector); + c2[1] = hc_byte_perm_S (w2[1], w2[2], selector); + c2[0] = hc_byte_perm_S (w2[0], w2[1], selector); + c1[3] = hc_byte_perm_S (w1[3], w2[0], selector); + c1[2] = hc_byte_perm_S (w1[2], w1[3], selector); + c1[1] = hc_byte_perm_S (w1[1], w1[2], selector); + c1[0] = hc_byte_perm_S (w1[0], w1[1], selector); + c0[3] = hc_byte_perm_S (w0[3], w1[0], selector); + c0[2] = hc_byte_perm_S (w0[2], w0[3], selector); + c0[1] = hc_byte_perm_S (w0[1], w0[2], selector); + c0[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[3] = hc_byte_perm_S ( 0, w0[0], selector); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif +} + DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset) { const int offset_switch = offset / 4; diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h index 07137297b..6e39b2ca3 100644 --- a/OpenCL/inc_common.h +++ b/OpenCL/inc_common.h @@ -262,6 +262,7 @@ DECLSPEC void switch_buffer_by_offset_carry_le (u32x *w0, u32x *w1, u32x *w2, u3 DECLSPEC void switch_buffer_by_offset_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, const u32 offset); DECLSPEC void switch_buffer_by_offset_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *c0, u32x *c1, u32x *c2, u32x *c3, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset); +DECLSPEC void switch_buffer_by_offset_8x4_carry_le (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, u32x *c0, u32x *c1, u32x *c2, u32x *c3, u32x *c4, u32x *c5, u32x *c6, u32x *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_1x64_le (u32x *w, const u32 offset); @@ -289,6 +290,7 @@ DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 DECLSPEC void switch_buffer_by_offset_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 offset); DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *c0, u32 *c1, u32 *c2, u32 *c3, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset); +DECLSPEC void switch_buffer_by_offset_8x4_carry_le_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 offset); DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, u32 *c0, u32 *c1, u32 *c2, u32 *c3, u32 *c4, u32 *c5, u32 *c6, u32 *c7, const u32 offset); DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 *w, const u32 offset); diff --git a/OpenCL/inc_hash_blake2b.cl b/OpenCL/inc_hash_blake2b.cl new file mode 100644 index 000000000..ac4377c2f --- /dev/null +++ b/OpenCL/inc_hash_blake2b.cl @@ -0,0 +1,662 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.h" +#include "inc_common.h" +#include "inc_hash_blake2b.h" + +DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const u32 len, const u64 f0) +{ + const u64 t0 = hl32_to_64_S (0, len); + + u64 v[16]; + + v[ 0] = h[0]; + v[ 1] = h[1]; + v[ 2] = h[2]; + v[ 3] = h[3]; + v[ 4] = h[4]; + v[ 5] = h[5]; + v[ 6] = h[6]; + v[ 7] = h[7]; + v[ 8] = BLAKE2B_IV_00; + v[ 9] = BLAKE2B_IV_01; + v[10] = BLAKE2B_IV_02; + v[11] = BLAKE2B_IV_03; + v[12] = BLAKE2B_IV_04 ^ t0; + v[13] = BLAKE2B_IV_05; // ^ t1; + v[14] = BLAKE2B_IV_06 ^ f0; + v[15] = BLAKE2B_IV_07; // ^ f1; + + BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); + BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + h[0] = h[0] ^ v[0] ^ v[ 8]; + h[1] = h[1] ^ v[1] ^ v[ 9]; + h[2] = h[2] ^ v[2] ^ v[10]; + h[3] = h[3] ^ v[3] ^ v[11]; + h[4] = h[4] ^ v[4] ^ v[12]; + h[5] = h[5] ^ v[5] ^ v[13]; + h[6] = h[6] ^ v[6] ^ v[14]; + h[7] = h[7] ^ v[7] ^ v[15]; +} + +DECLSPEC void blake2b_init (blake2b_ctx_t *ctx) +{ + ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes + ctx->h[1] = BLAKE2B_IV_01; + ctx->h[2] = BLAKE2B_IV_02; + ctx->h[3] = BLAKE2B_IV_03; + ctx->h[4] = BLAKE2B_IV_04; + ctx->h[5] = BLAKE2B_IV_05; + ctx->h[6] = BLAKE2B_IV_06; + ctx->h[7] = BLAKE2B_IV_07; + + ctx->m[ 0] = 0; + ctx->m[ 1] = 0; + ctx->m[ 2] = 0; + ctx->m[ 3] = 0; + ctx->m[ 4] = 0; + ctx->m[ 5] = 0; + ctx->m[ 6] = 0; + ctx->m[ 7] = 0; + ctx->m[ 8] = 0; + ctx->m[ 9] = 0; + ctx->m[10] = 0; + ctx->m[11] = 0; + ctx->m[12] = 0; + ctx->m[13] = 0; + ctx->m[14] = 0; + ctx->m[15] = 0; + + ctx->len = 0; +} + +DECLSPEC void blake2b_update_128 (blake2b_ctx_t *ctx, u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const u32 len) +{ + MAYBE_VOLATILE const u32 pos = ctx->len & 127; + + if (pos == 0) + { + if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform + { + blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_UPDATE); + } + + ctx->m[ 0] = hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] = hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] = hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] = hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] = hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] = hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] = hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] = hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] = hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] = hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] = hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] = hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] = hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] = hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] = hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] = hl32_to_64_S (w7[3], w7[2]); + } + else + { + if ((pos + len) <= 128) + { + switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, pos); + + ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]); + } + else + { + u32 c0[4] = { 0 }; + u32 c1[4] = { 0 }; + u32 c2[4] = { 0 }; + u32 c3[4] = { 0 }; + u32 c4[4] = { 0 }; + u32 c5[4] = { 0 }; + u32 c6[4] = { 0 }; + u32 c7[4] = { 0 }; + + switch_buffer_by_offset_8x4_carry_le_S (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos); + + ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]); + + // len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE: + + const u32 cur_len = ((ctx->len + len) / 128) * 128; + + blake2b_transform (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE); + + ctx->m[ 0] = hl32_to_64_S (c0[1], c0[0]); + ctx->m[ 1] = hl32_to_64_S (c0[3], c0[2]); + ctx->m[ 2] = hl32_to_64_S (c1[1], c1[0]); + ctx->m[ 3] = hl32_to_64_S (c1[3], c1[2]); + ctx->m[ 4] = hl32_to_64_S (c2[1], c2[0]); + ctx->m[ 5] = hl32_to_64_S (c2[3], c2[2]); + ctx->m[ 6] = hl32_to_64_S (c3[1], c3[0]); + ctx->m[ 7] = hl32_to_64_S (c3[3], c3[2]); + ctx->m[ 8] = hl32_to_64_S (c4[1], c4[0]); + ctx->m[ 9] = hl32_to_64_S (c4[3], c4[2]); + ctx->m[10] = hl32_to_64_S (c5[1], c5[0]); + ctx->m[11] = hl32_to_64_S (c5[3], c5[2]); + ctx->m[12] = hl32_to_64_S (c6[1], c6[0]); + ctx->m[13] = hl32_to_64_S (c6[3], c6[2]); + ctx->m[14] = hl32_to_64_S (c7[1], c7[0]); + ctx->m[15] = hl32_to_64_S (c7[3], c7[2]); + } + } + + ctx->len += len; +} + +DECLSPEC void blake2b_update (blake2b_ctx_t *ctx, const u32 *w, const u32 len) +{ + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_update_global (blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const u32 len) +{ + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + u32 w4[4]; + u32 w5[4]; + u32 w6[4]; + u32 w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_final (blake2b_ctx_t *ctx) +{ + blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_FINAL); +} + +DECLSPEC void blake2b_transform_vector (u64x *h, const u64x *m, const u32x len, const u64 f0) +{ + const u64x t0 = hl32_to_64 (0, len); + + u64x v[16]; + + v[ 0] = h[0]; + v[ 1] = h[1]; + v[ 2] = h[2]; + v[ 3] = h[3]; + v[ 4] = h[4]; + v[ 5] = h[5]; + v[ 6] = h[6]; + v[ 7] = h[7]; + v[ 8] = BLAKE2B_IV_00; + v[ 9] = BLAKE2B_IV_01; + v[10] = BLAKE2B_IV_02; + v[11] = BLAKE2B_IV_03; + v[12] = BLAKE2B_IV_04 ^ t0; + v[13] = BLAKE2B_IV_05; // ^ t1; + v[14] = BLAKE2B_IV_06 ^ f0; + v[15] = BLAKE2B_IV_07; // ^ f1; + + BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + BLAKE2B_ROUND_VECTOR (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); + BLAKE2B_ROUND_VECTOR ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); + BLAKE2B_ROUND_VECTOR ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); + BLAKE2B_ROUND_VECTOR ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); + BLAKE2B_ROUND_VECTOR (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); + BLAKE2B_ROUND_VECTOR (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); + BLAKE2B_ROUND_VECTOR ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); + BLAKE2B_ROUND_VECTOR (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); + BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); + + h[0] = h[0] ^ v[0] ^ v[ 8]; + h[1] = h[1] ^ v[1] ^ v[ 9]; + h[2] = h[2] ^ v[2] ^ v[10]; + h[3] = h[3] ^ v[3] ^ v[11]; + h[4] = h[4] ^ v[4] ^ v[12]; + h[5] = h[5] ^ v[5] ^ v[13]; + h[6] = h[6] ^ v[6] ^ v[14]; + h[7] = h[7] ^ v[7] ^ v[15]; +} + +DECLSPEC void blake2b_init_vector (blake2b_ctx_vector_t *ctx) +{ + ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes + ctx->h[1] = BLAKE2B_IV_01; + ctx->h[2] = BLAKE2B_IV_02; + ctx->h[3] = BLAKE2B_IV_03; + ctx->h[4] = BLAKE2B_IV_04; + ctx->h[5] = BLAKE2B_IV_05; + ctx->h[6] = BLAKE2B_IV_06; + ctx->h[7] = BLAKE2B_IV_07; + + ctx->m[ 0] = 0; + ctx->m[ 1] = 0; + ctx->m[ 2] = 0; + ctx->m[ 3] = 0; + ctx->m[ 4] = 0; + ctx->m[ 5] = 0; + ctx->m[ 6] = 0; + ctx->m[ 7] = 0; + ctx->m[ 8] = 0; + ctx->m[ 9] = 0; + ctx->m[10] = 0; + ctx->m[11] = 0; + ctx->m[12] = 0; + ctx->m[13] = 0; + ctx->m[14] = 0; + ctx->m[15] = 0; + + ctx->len = 0; +} + +DECLSPEC void blake2b_update_vector_128 (blake2b_ctx_vector_t *ctx, u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const u32 len) +{ + MAYBE_VOLATILE const u32 pos = ctx->len & 127; + + if (pos == 0) + { + if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform + { + blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_UPDATE); + } + + ctx->m[ 0] = hl32_to_64 (w0[1], w0[0]); + ctx->m[ 1] = hl32_to_64 (w0[3], w0[2]); + ctx->m[ 2] = hl32_to_64 (w1[1], w1[0]); + ctx->m[ 3] = hl32_to_64 (w1[3], w1[2]); + ctx->m[ 4] = hl32_to_64 (w2[1], w2[0]); + ctx->m[ 5] = hl32_to_64 (w2[3], w2[2]); + ctx->m[ 6] = hl32_to_64 (w3[1], w3[0]); + ctx->m[ 7] = hl32_to_64 (w3[3], w3[2]); + ctx->m[ 8] = hl32_to_64 (w4[1], w4[0]); + ctx->m[ 9] = hl32_to_64 (w4[3], w4[2]); + ctx->m[10] = hl32_to_64 (w5[1], w5[0]); + ctx->m[11] = hl32_to_64 (w5[3], w5[2]); + ctx->m[12] = hl32_to_64 (w6[1], w6[0]); + ctx->m[13] = hl32_to_64 (w6[3], w6[2]); + ctx->m[14] = hl32_to_64 (w7[1], w7[0]); + ctx->m[15] = hl32_to_64 (w7[3], w7[2]); + } + else + { + if ((pos + len) <= 128) + { + switch_buffer_by_offset_8x4_le (w0, w1, w2, w3, w4, w5, w6, w7, pos); + + ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64 (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64 (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64 (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64 (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64 (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64 (w7[3], w7[2]); + } + else + { + u32x c0[4] = { 0 }; + u32x c1[4] = { 0 }; + u32x c2[4] = { 0 }; + u32x c3[4] = { 0 }; + u32x c4[4] = { 0 }; + u32x c5[4] = { 0 }; + u32x c6[4] = { 0 }; + u32x c7[4] = { 0 }; + + switch_buffer_by_offset_8x4_carry_le (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos); + + ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]); + ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]); + ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]); + ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]); + ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]); + ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]); + ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]); + ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]); + ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]); + ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]); + ctx->m[10] |= hl32_to_64 (w5[1], w5[0]); + ctx->m[11] |= hl32_to_64 (w5[3], w5[2]); + ctx->m[12] |= hl32_to_64 (w6[1], w6[0]); + ctx->m[13] |= hl32_to_64 (w6[3], w6[2]); + ctx->m[14] |= hl32_to_64 (w7[1], w7[0]); + ctx->m[15] |= hl32_to_64 (w7[3], w7[2]); + + // len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE: + + const u32x cur_len = ((ctx->len + len) / 128) * 128; + + blake2b_transform_vector (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE); + + ctx->m[ 0] = hl32_to_64 (c0[1], c0[0]); + ctx->m[ 1] = hl32_to_64 (c0[3], c0[2]); + ctx->m[ 2] = hl32_to_64 (c1[1], c1[0]); + ctx->m[ 3] = hl32_to_64 (c1[3], c1[2]); + ctx->m[ 4] = hl32_to_64 (c2[1], c2[0]); + ctx->m[ 5] = hl32_to_64 (c2[3], c2[2]); + ctx->m[ 6] = hl32_to_64 (c3[1], c3[0]); + ctx->m[ 7] = hl32_to_64 (c3[3], c3[2]); + ctx->m[ 8] = hl32_to_64 (c4[1], c4[0]); + ctx->m[ 9] = hl32_to_64 (c4[3], c4[2]); + ctx->m[10] = hl32_to_64 (c5[1], c5[0]); + ctx->m[11] = hl32_to_64 (c5[3], c5[2]); + ctx->m[12] = hl32_to_64 (c6[1], c6[0]); + ctx->m[13] = hl32_to_64 (c6[3], c6[2]); + ctx->m[14] = hl32_to_64 (c7[1], c7[0]); + ctx->m[15] = hl32_to_64 (c7[3], c7[2]); + } + } + + ctx->len += len; +} + +DECLSPEC void blake2b_update_vector (blake2b_ctx_vector_t *ctx, const u32x *w, const u32 len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + const int limit = (const int) len - 128; // int type needed, could be negative + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < limit; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) pos1); +} + +DECLSPEC void blake2b_final_vector (blake2b_ctx_vector_t *ctx) +{ + blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_FINAL); +} diff --git a/OpenCL/inc_hash_blake2b.h b/OpenCL/inc_hash_blake2b.h new file mode 100644 index 000000000..798b651b7 --- /dev/null +++ b/OpenCL/inc_hash_blake2b.h @@ -0,0 +1,90 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#ifndef _INC_HASH_BLAKE2B_H +#define _INC_HASH_BLAKE2B_H + +#define BLAKE2B_UPDATE 0 +#define BLAKE2B_FINAL -1 + +#define BLAKE2B_G(k0,k1,a,b,c,d) \ +{ \ + a = a + b + m[k0]; \ + d = hc_rotr64_S (d ^ a, 32); \ + c = c + d; \ + b = hc_rotr64_S (b ^ c, 24); \ + a = a + b + m[k1]; \ + d = hc_rotr64_S (d ^ a, 16); \ + c = c + d; \ + b = hc_rotr64_S (b ^ c, 63); \ +} + +#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ +{ \ + BLAKE2B_G (c0, c1, v[0], v[4], v[ 8], v[12]); \ + BLAKE2B_G (c2, c3, v[1], v[5], v[ 9], v[13]); \ + BLAKE2B_G (c4, c5, v[2], v[6], v[10], v[14]); \ + BLAKE2B_G (c6, c7, v[3], v[7], v[11], v[15]); \ + BLAKE2B_G (c8, c9, v[0], v[5], v[10], v[15]); \ + BLAKE2B_G (ca, cb, v[1], v[6], v[11], v[12]); \ + BLAKE2B_G (cc, cd, v[2], v[7], v[ 8], v[13]); \ + BLAKE2B_G (ce, cf, v[3], v[4], v[ 9], v[14]); \ +} + +#define BLAKE2B_G_VECTOR(k0,k1,a,b,c,d) \ +{ \ + a = a + b + m[k0]; \ + d = hc_rotr64 (d ^ a, 32); \ + c = c + d; \ + b = hc_rotr64 (b ^ c, 24); \ + a = a + b + m[k1]; \ + d = hc_rotr64 (d ^ a, 16); \ + c = c + d; \ + b = hc_rotr64 (b ^ c, 63); \ +} + +#define BLAKE2B_ROUND_VECTOR(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ +{ \ + BLAKE2B_G_VECTOR (c0, c1, v[0], v[4], v[ 8], v[12]); \ + BLAKE2B_G_VECTOR (c2, c3, v[1], v[5], v[ 9], v[13]); \ + BLAKE2B_G_VECTOR (c4, c5, v[2], v[6], v[10], v[14]); \ + BLAKE2B_G_VECTOR (c6, c7, v[3], v[7], v[11], v[15]); \ + BLAKE2B_G_VECTOR (c8, c9, v[0], v[5], v[10], v[15]); \ + BLAKE2B_G_VECTOR (ca, cb, v[1], v[6], v[11], v[12]); \ + BLAKE2B_G_VECTOR (cc, cd, v[2], v[7], v[ 8], v[13]); \ + BLAKE2B_G_VECTOR (ce, cf, v[3], v[4], v[ 9], v[14]); \ +} + +typedef struct blake2b_ctx +{ + u64 m[16]; // buffer + u64 h[ 8]; // digest + + u32 len; + +} blake2b_ctx_t; + +typedef struct blake2b_ctx_vector +{ + u64x m[16]; // buffer + u64x h[ 8]; // digest + + u32 len; + +} blake2b_ctx_vector_t; + +DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const u32 len, const u64 f0); +DECLSPEC void blake2b_init (blake2b_ctx_t *ctx); +DECLSPEC void blake2b_update (blake2b_ctx_t *ctx, const u32 *w, const u32 len); +DECLSPEC void blake2b_update_global (blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const u32 len); +DECLSPEC void blake2b_final (blake2b_ctx_t *ctx); + +DECLSPEC void blake2b_transform_vector (u64x *h, const u64x *m, const u32x len, const u64 f0); +DECLSPEC void blake2b_init_vector (blake2b_ctx_vector_t *ctx); +DECLSPEC void blake2b_update_vector (blake2b_ctx_vector_t *ctx, const u32x *w, const u32 len); +DECLSPEC void blake2b_final_vector (blake2b_ctx_vector_t *ctx); + + +#endif // _INC_HASH_BLAKE2B_H diff --git a/OpenCL/m00600_a0-optimized.cl b/OpenCL/m00600_a0-optimized.cl index 0967e3cff..1a499c113 100644 --- a/OpenCL/m00600_a0-optimized.cl +++ b/OpenCL/m00600_a0-optimized.cl @@ -13,117 +13,15 @@ #include "inc_rp_optimized.h" #include "inc_rp_optimized.cl" #include "inc_simd.cl" +#include "inc_hash_blake2b.cl" #endif -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - -#define BLAKE2B_FINAL 1 -#define BLAKE2B_UPDATE 0 - -#define BLAKE2B_G(k0,k1,a,b,c,d) \ - do { \ - a = a + b + m[(k0)]; \ - d = hc_rotr64 (d ^ a, 32); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 24); \ - a = a + b + m[(k1)]; \ - d = hc_rotr64 (d ^ a, 16); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 63); \ - } while (0) - -#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ - do { \ - BLAKE2B_G ((c0),(c1),v[ 0],v[ 4],v[ 8],v[12]); \ - BLAKE2B_G ((c2),(c3),v[ 1],v[ 5],v[ 9],v[13]); \ - BLAKE2B_G ((c4),(c5),v[ 2],v[ 6],v[10],v[14]); \ - BLAKE2B_G ((c6),(c7),v[ 3],v[ 7],v[11],v[15]); \ - BLAKE2B_G ((c8),(c9),v[ 0],v[ 5],v[10],v[15]); \ - BLAKE2B_G ((ca),(cb),v[ 1],v[ 6],v[11],v[12]); \ - BLAKE2B_G ((cc),(cd),v[ 2],v[ 7],v[ 8],v[13]); \ - BLAKE2B_G ((ce),(cf),v[ 3],v[ 4],v[ 9],v[14]); \ -} while (0) - -DECLSPEC void blake2b_transform (u64x *h, u64x *t, u64x *f, u64x *m, u64x *v, const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, const u32x out_len, const u8 isFinal) -{ - if (isFinal) - f[0] = -1; - - t[0] += hl32_to_64 (0, out_len); - - m[ 0] = hl32_to_64 (w0[1], w0[0]); - m[ 1] = hl32_to_64 (w0[3], w0[2]); - m[ 2] = hl32_to_64 (w1[1], w1[0]); - m[ 3] = hl32_to_64 (w1[3], w1[2]); - m[ 4] = hl32_to_64 (w2[1], w2[0]); - m[ 5] = hl32_to_64 (w2[3], w2[2]); - m[ 6] = hl32_to_64 (w3[1], w3[0]); - m[ 7] = hl32_to_64 (w3[3], w3[2]); - m[ 8] = 0; - m[ 9] = 0; - m[10] = 0; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0; - - v[ 0] = h[0]; - v[ 1] = h[1]; - v[ 2] = h[2]; - v[ 3] = h[3]; - v[ 4] = h[4]; - v[ 5] = h[5]; - v[ 6] = h[6]; - v[ 7] = h[7]; - v[ 8] = BLAKE2B_IV_00; - v[ 9] = BLAKE2B_IV_01; - v[10] = BLAKE2B_IV_02; - v[11] = BLAKE2B_IV_03; - v[12] = BLAKE2B_IV_04 ^ t[0]; - v[13] = BLAKE2B_IV_05 ^ t[1]; - v[14] = BLAKE2B_IV_06 ^ f[0]; - v[15] = BLAKE2B_IV_07 ^ f[1]; - - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); - BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); - BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); - BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); - BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); - BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); - BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); - BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - - h[0] = h[0] ^ v[0] ^ v[ 8]; - h[1] = h[1] ^ v[1] ^ v[ 9]; - h[2] = h[2] ^ v[2] ^ v[10]; - h[3] = h[3] ^ v[3] ^ v[11]; - h[4] = h[4] ^ v[4] ^ v[12]; - h[5] = h[5] ^ v[5] ^ v[13]; - h[6] = h[6] ^ v[6] ^ v[14]; - h[7] = h[7] ^ v[7] ^ v[15]; -} - -KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES ()) { /** - * modifier + * base */ - const u64 lid = get_local_id (0); - const u64 gid = get_global_id (0); if (gid >= gid_max) return; @@ -142,24 +40,6 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32 pw_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * loop */ @@ -173,64 +53,61 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_M_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_m08 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_m08 (KERN_ATTR_RULES ()) { } -KERNEL_FQ void m00600_m16 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_m16 (KERN_ATTR_RULES ()) { } -KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES ()) { /** - * modifier + * base */ - const u64 lid = get_local_id (0); - const u64 gid = get_global_id (0); if (gid >= gid_max) return; @@ -249,24 +126,6 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32 pw_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * digest */ @@ -292,52 +151,51 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_RULES_ESALT (blake2_t)) const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_S_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_s08 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_s08 (KERN_ATTR_RULES ()) { } -KERNEL_FQ void m00600_s16 (KERN_ATTR_RULES_ESALT (blake2_t)) +KERNEL_FQ void m00600_s16 (KERN_ATTR_RULES ()) { } diff --git a/OpenCL/m00600_a0-pure.cl b/OpenCL/m00600_a0-pure.cl new file mode 100644 index 000000000..07dd567f0 --- /dev/null +++ b/OpenCL/m00600_a0-pure.cl @@ -0,0 +1,111 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_blake2b.cl" +#endif + +KERNEL_FQ void m00600_mxx (KERN_ATTR_RULES ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + COPY_PW (pws[gid]); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + pw_t tmp = PASTE_PW; + + tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len); + + blake2b_ctx_t ctx; + + blake2b_init (&ctx); + blake2b_update (&ctx, tmp.i, tmp.pw_len); + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m00600_sxx (KERN_ATTR_RULES ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + COPY_PW (pws[gid]); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + pw_t tmp = PASTE_PW; + + tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len); + + blake2b_ctx_t ctx; + + blake2b_init (&ctx); + blake2b_update (&ctx, tmp.i, tmp.pw_len); + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m00600_a1-optimized.cl b/OpenCL/m00600_a1-optimized.cl index dea787469..64c852492 100644 --- a/OpenCL/m00600_a1-optimized.cl +++ b/OpenCL/m00600_a1-optimized.cl @@ -11,117 +11,18 @@ #include "inc_platform.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_blake2b.cl" #endif -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - -#define BLAKE2B_FINAL 1 -#define BLAKE2B_UPDATE 0 - -#define BLAKE2B_G(k0,k1,a,b,c,d) \ - do { \ - a = a + b + m[(k0)]; \ - d = hc_rotr64 (d ^ a, 32); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 24); \ - a = a + b + m[(k1)]; \ - d = hc_rotr64 (d ^ a, 16); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 63); \ - } while (0) - -#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ - do { \ - BLAKE2B_G ((c0),(c1),v[ 0],v[ 4],v[ 8],v[12]); \ - BLAKE2B_G ((c2),(c3),v[ 1],v[ 5],v[ 9],v[13]); \ - BLAKE2B_G ((c4),(c5),v[ 2],v[ 6],v[10],v[14]); \ - BLAKE2B_G ((c6),(c7),v[ 3],v[ 7],v[11],v[15]); \ - BLAKE2B_G ((c8),(c9),v[ 0],v[ 5],v[10],v[15]); \ - BLAKE2B_G ((ca),(cb),v[ 1],v[ 6],v[11],v[12]); \ - BLAKE2B_G ((cc),(cd),v[ 2],v[ 7],v[ 8],v[13]); \ - BLAKE2B_G ((ce),(cf),v[ 3],v[ 4],v[ 9],v[14]); \ -} while (0) - -DECLSPEC void blake2b_transform (u64x *h, u64x *t, u64x *f, u64x *m, u64x *v, const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, const u32x out_len, const u8 isFinal) -{ - if (isFinal) - f[0] = -1; - - t[0] += hl32_to_64 (0, out_len); - - m[ 0] = hl32_to_64 (w0[1], w0[0]); - m[ 1] = hl32_to_64 (w0[3], w0[2]); - m[ 2] = hl32_to_64 (w1[1], w1[0]); - m[ 3] = hl32_to_64 (w1[3], w1[2]); - m[ 4] = hl32_to_64 (w2[1], w2[0]); - m[ 5] = hl32_to_64 (w2[3], w2[2]); - m[ 6] = hl32_to_64 (w3[1], w3[0]); - m[ 7] = hl32_to_64 (w3[3], w3[2]); - m[ 8] = 0; - m[ 9] = 0; - m[10] = 0; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0; - - v[ 0] = h[0]; - v[ 1] = h[1]; - v[ 2] = h[2]; - v[ 3] = h[3]; - v[ 4] = h[4]; - v[ 5] = h[5]; - v[ 6] = h[6]; - v[ 7] = h[7]; - v[ 8] = BLAKE2B_IV_00; - v[ 9] = BLAKE2B_IV_01; - v[10] = BLAKE2B_IV_02; - v[11] = BLAKE2B_IV_03; - v[12] = BLAKE2B_IV_04 ^ t[0]; - v[13] = BLAKE2B_IV_05 ^ t[1]; - v[14] = BLAKE2B_IV_06 ^ f[0]; - v[15] = BLAKE2B_IV_07 ^ f[1]; - - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); - BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); - BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); - BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); - BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); - BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); - BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); - BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - - h[0] = h[0] ^ v[0] ^ v[ 8]; - h[1] = h[1] ^ v[1] ^ v[ 9]; - h[2] = h[2] ^ v[2] ^ v[10]; - h[3] = h[3] ^ v[3] ^ v[11]; - h[4] = h[4] ^ v[4] ^ v[12]; - h[5] = h[5] ^ v[5] ^ v[13]; - h[6] = h[6] ^ v[6] ^ v[14]; - h[7] = h[7] ^ v[7] ^ v[15]; -} - -KERNEL_FQ void m00600_m04 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m04 (KERN_ATTR_BASIC ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); + + if (gid >= gid_max) return; u32 pw_buf0[4]; u32 pw_buf1[4]; @@ -137,24 +38,6 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_ESALT (blake2_t)) const u32 pw_l_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * loop */ @@ -228,64 +111,61 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_ESALT (blake2_t)) w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_M_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_m08 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m08 (KERN_ATTR_BASIC ()) { } -KERNEL_FQ void m00600_m16 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m16 (KERN_ATTR_BASIC ()) { } -KERNEL_FQ void m00600_s04 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s04 (KERN_ATTR_BASIC ()) { /** - * modifier + * base */ - const u64 lid = get_local_id (0); - const u64 gid = get_global_id (0); if (gid >= gid_max) return; @@ -304,24 +184,6 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_ESALT (blake2_t)) const u32 pw_l_len = pws[gid].pw_len & 63; - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - /** * digest */ @@ -407,52 +269,51 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_ESALT (blake2_t)) w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; + blake2b_transform_vector (h, m, out_len, BLAKE2B_FINAL); - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_S_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_s08 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s08 (KERN_ATTR_BASIC ()) { } -KERNEL_FQ void m00600_s16 (KERN_ATTR_ESALT (blake2_t)) +KERNEL_FQ void m00600_s16 (KERN_ATTR_BASIC ()) { } diff --git a/OpenCL/m00600_a1-pure.cl b/OpenCL/m00600_a1-pure.cl new file mode 100644 index 000000000..4cc7c9707 --- /dev/null +++ b/OpenCL/m00600_a1-pure.cl @@ -0,0 +1,109 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_blake2b.cl" +#endif + +KERNEL_FQ void m00600_mxx (KERN_ATTR_BASIC ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + blake2b_ctx_t ctx0; + + blake2b_init (&ctx0); + + blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + blake2b_ctx_t ctx = ctx0; + + blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m00600_sxx (KERN_ATTR_BASIC ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + blake2b_ctx_t ctx0; + + blake2b_init (&ctx0); + + blake2b_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + blake2b_ctx_t ctx = ctx0; + + blake2b_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + blake2b_final (&ctx); + + const u32 r0 = h32_from_64_S (ctx.h[0]); + const u32 r1 = l32_from_64_S (ctx.h[0]); + const u32 r2 = h32_from_64_S (ctx.h[1]); + const u32 r3 = l32_from_64_S (ctx.h[1]); + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m00600_a3-optimized.cl b/OpenCL/m00600_a3-optimized.cl index 2fa9e46b6..20f9e7327 100644 --- a/OpenCL/m00600_a3-optimized.cl +++ b/OpenCL/m00600_a3-optimized.cl @@ -11,141 +11,22 @@ #include "inc_platform.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_blake2b.cl" #endif -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - -#define BLAKE2B_FINAL 1 -#define BLAKE2B_UPDATE 0 - -#define BLAKE2B_G(k0,k1,a,b,c,d) \ - do { \ - a = a + b + m[(k0)]; \ - d = hc_rotr64 (d ^ a, 32); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 24); \ - a = a + b + m[(k1)]; \ - d = hc_rotr64 (d ^ a, 16); \ - c = c + d; \ - b = hc_rotr64 (b ^ c, 63); \ - } while (0) - -#define BLAKE2B_ROUND(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf) \ - do { \ - BLAKE2B_G ((c0),(c1),v[ 0],v[ 4],v[ 8],v[12]); \ - BLAKE2B_G ((c2),(c3),v[ 1],v[ 5],v[ 9],v[13]); \ - BLAKE2B_G ((c4),(c5),v[ 2],v[ 6],v[10],v[14]); \ - BLAKE2B_G ((c6),(c7),v[ 3],v[ 7],v[11],v[15]); \ - BLAKE2B_G ((c8),(c9),v[ 0],v[ 5],v[10],v[15]); \ - BLAKE2B_G ((ca),(cb),v[ 1],v[ 6],v[11],v[12]); \ - BLAKE2B_G ((cc),(cd),v[ 2],v[ 7],v[ 8],v[13]); \ - BLAKE2B_G ((ce),(cf),v[ 3],v[ 4],v[ 9],v[14]); \ -} while (0) - -DECLSPEC void blake2b_transform (u64x *h, u64x *t, u64x *f, u64x *m, u64x *v, const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, const u32x out_len, const u8 isFinal) -{ - if (isFinal) - f[0] = -1; - - t[0] += hl32_to_64 (0, out_len); - - m[ 0] = hl32_to_64 (w0[1], w0[0]); - m[ 1] = hl32_to_64 (w0[3], w0[2]); - m[ 2] = hl32_to_64 (w1[1], w1[0]); - m[ 3] = hl32_to_64 (w1[3], w1[2]); - m[ 4] = hl32_to_64 (w2[1], w2[0]); - m[ 5] = hl32_to_64 (w2[3], w2[2]); - m[ 6] = hl32_to_64 (w3[1], w3[0]); - m[ 7] = hl32_to_64 (w3[3], w3[2]); - m[ 8] = 0; - m[ 9] = 0; - m[10] = 0; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0; - - v[ 0] = h[0]; - v[ 1] = h[1]; - v[ 2] = h[2]; - v[ 3] = h[3]; - v[ 4] = h[4]; - v[ 5] = h[5]; - v[ 6] = h[6]; - v[ 7] = h[7]; - v[ 8] = BLAKE2B_IV_00; - v[ 9] = BLAKE2B_IV_01; - v[10] = BLAKE2B_IV_02; - v[11] = BLAKE2B_IV_03; - v[12] = BLAKE2B_IV_04 ^ t[0]; - v[13] = BLAKE2B_IV_05 ^ t[1]; - v[14] = BLAKE2B_IV_06 ^ f[0]; - v[15] = BLAKE2B_IV_07 ^ f[1]; - - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4); - BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8); - BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13); - BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9); - BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11); - BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10); - BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5); - BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0); - BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3); - - h[0] = h[0] ^ v[0] ^ v[ 8]; - h[1] = h[1] ^ v[1] ^ v[ 9]; - h[2] = h[2] ^ v[2] ^ v[10]; - h[3] = h[3] ^ v[3] ^ v[11]; - h[4] = h[4] ^ v[4] ^ v[12]; - h[5] = h[5] ^ v[5] ^ v[13]; - h[6] = h[6] ^ v[6] ^ v[14]; - h[7] = h[7] ^ v[7] ^ v[15]; -} - -KERNEL_FQ void m00600_m04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +DECLSPEC void m00600m (u32 *w, const u32 pw_len, KERN_ATTR_VECTOR ()) { /** * modifier */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; /** * loop */ - u32 w0l = pws[gid].i[0]; + u32 w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { @@ -158,300 +39,70 @@ KERNEL_FQ void m00600_m04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) u32x w3[4]; w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + w0[1] = w[ 1]; + w0[2] = w[ 2]; + w0[3] = w[ 3]; + w1[0] = w[ 4]; + w1[1] = w[ 5]; + w1[2] = w[ 6]; + w1[3] = w[ 7]; + w2[0] = w[ 8]; + w2[1] = w[ 9]; + w2[2] = w[10]; + w2[3] = w[11]; + w3[0] = w[12]; + w3[1] = w[13]; + w3[2] = w[14]; + w3[3] = w[15]; - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; + blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL); - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_M_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_m08 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +DECLSPEC void m00600s (u32 *w, const u32 pw_len, KERN_ATTR_VECTOR ()) { /** * modifier */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - - /** - * loop - */ - - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; - - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); - - COMPARE_M_SIMD (r0, r1, r2, r3); - } -} - -KERNEL_FQ void m00600_m16 (KERN_ATTR_VECTOR_ESALT (blake2_t)) -{ - /** - * modifier - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; - - /** - * loop - */ - - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; - - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); - - COMPARE_M_SIMD (r0, r1, r2, r3); - } -} - -KERNEL_FQ void m00600_s04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) -{ - /** - * modifier - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; - - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; - - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; /** * digest @@ -469,7 +120,7 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) * loop */ - u32 w0l = pws[gid].i[0]; + u32 w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { @@ -482,294 +133,287 @@ KERNEL_FQ void m00600_s04 (KERN_ATTR_VECTOR_ESALT (blake2_t)) u32x w3[4]; w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; + w0[1] = w[ 1]; + w0[2] = w[ 2]; + w0[3] = w[ 3]; + w1[0] = w[ 4]; + w1[1] = w[ 5]; + w1[2] = w[ 6]; + w1[3] = w[ 7]; + w2[0] = w[ 8]; + w2[1] = w[ 9]; + w2[2] = w[10]; + w2[3] = w[11]; + w3[0] = w[12]; + w3[1] = w[13]; + w3[2] = w[14]; + w3[3] = w[15]; - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; u64x m[16]; - u64x v[16]; + + m[ 0] = hl32_to_64 (w0[1], w0[0]); + m[ 1] = hl32_to_64 (w0[3], w0[2]); + m[ 2] = hl32_to_64 (w1[1], w1[0]); + m[ 3] = hl32_to_64 (w1[3], w1[2]); + m[ 4] = hl32_to_64 (w2[1], w2[0]); + m[ 5] = hl32_to_64 (w2[3], w2[2]); + m[ 6] = hl32_to_64 (w3[1], w3[0]); + m[ 7] = hl32_to_64 (w3[3], w3[2]); + m[ 8] = 0; + m[ 9] = 0; + m[10] = 0; + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = 0; u64x h[8]; - u64x t[2]; - u64x f[2]; - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; + h[0] = BLAKE2B_IV_00 ^ 0x01010040; + h[1] = BLAKE2B_IV_01; + h[2] = BLAKE2B_IV_02; + h[3] = BLAKE2B_IV_03; + h[4] = BLAKE2B_IV_04; + h[5] = BLAKE2B_IV_05; + h[6] = BLAKE2B_IV_06; + h[7] = BLAKE2B_IV_07; - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; + blake2b_transform_vector (h, m, pw_len, BLAKE2B_FINAL); - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); + const u32x r0 = h32_from_64 (h[0]); + const u32x r1 = l32_from_64 (h[0]); + const u32x r2 = h32_from_64 (h[1]); + const u32x r3 = l32_from_64 (h[1]); COMPARE_S_SIMD (r0, r1, r2, r3); } } -KERNEL_FQ void m00600_s08 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m04 (KERN_ATTR_VECTOR ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; + if (gid >= gid_max) return; - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; + u32 w[16]; - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; /** - * digest + * main */ - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * loop - */ - - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; - - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); - - COMPARE_S_SIMD (r0, r1, r2, r3); - } + m00600m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); } -KERNEL_FQ void m00600_s16 (KERN_ATTR_VECTOR_ESALT (blake2_t)) +KERNEL_FQ void m00600_m08 (KERN_ATTR_VECTOR ()) { /** - * modifier + * base */ const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - u64 tmp_h[8]; - u64 tmp_t[2]; - u64 tmp_f[2]; + if (gid >= gid_max) return; - tmp_h[0] = esalt_bufs[digests_offset].h[0]; - tmp_h[1] = esalt_bufs[digests_offset].h[1]; - tmp_h[2] = esalt_bufs[digests_offset].h[2]; - tmp_h[3] = esalt_bufs[digests_offset].h[3]; - tmp_h[4] = esalt_bufs[digests_offset].h[4]; - tmp_h[5] = esalt_bufs[digests_offset].h[5]; - tmp_h[6] = esalt_bufs[digests_offset].h[6]; - tmp_h[7] = esalt_bufs[digests_offset].h[7]; + u32 w[16]; - tmp_t[0] = esalt_bufs[digests_offset].t[0]; - tmp_t[1] = esalt_bufs[digests_offset].t[1]; - tmp_f[0] = esalt_bufs[digests_offset].f[0]; - tmp_f[1] = esalt_bufs[digests_offset].f[1]; + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; /** - * digest + * main */ - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * loop - */ - - u32 w0l = pws[gid].i[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0x = w0l | w0r; - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = w0x; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - u32x out_len = pws[gid].pw_len; - - u64x digest[8]; - u64x m[16]; - u64x v[16]; - - u64x h[8]; - u64x t[2]; - u64x f[2]; - - h[0] = tmp_h[0]; - h[1] = tmp_h[1]; - h[2] = tmp_h[2]; - h[3] = tmp_h[3]; - h[4] = tmp_h[4]; - h[5] = tmp_h[5]; - h[6] = tmp_h[6]; - h[7] = tmp_h[7]; - - t[0] = tmp_t[0]; - t[1] = tmp_t[1]; - f[0] = tmp_f[0]; - f[1] = tmp_f[1]; - - blake2b_transform (h, t, f, m, v, w0, w1, w2, w3, out_len, BLAKE2B_FINAL); - - digest[0] = h[0]; - digest[1] = h[1]; - digest[2] = h[2]; - digest[3] = h[3]; - digest[4] = h[4]; - digest[5] = h[5]; - digest[6] = h[6]; - digest[7] = h[7]; - - const u32x r0 = h32_from_64 (digest[0]); - const u32x r1 = l32_from_64 (digest[0]); - const u32x r2 = h32_from_64 (digest[1]); - const u32x r3 = l32_from_64 (digest[1]); - - COMPARE_S_SIMD (r0, r1, r2, r3); - } + m00600m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} + +KERNEL_FQ void m00600_m16 (KERN_ATTR_VECTOR ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m00600m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} + +KERNEL_FQ void m00600_s04 (KERN_ATTR_VECTOR ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = 0; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m00600s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} + +KERNEL_FQ void m00600_s08 (KERN_ATTR_VECTOR ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m00600s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); +} + +KERNEL_FQ void m00600_s16 (KERN_ATTR_VECTOR ()) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w[16]; + + w[ 0] = pws[gid].i[ 0]; + w[ 1] = pws[gid].i[ 1]; + w[ 2] = pws[gid].i[ 2]; + w[ 3] = pws[gid].i[ 3]; + w[ 4] = pws[gid].i[ 4]; + w[ 5] = pws[gid].i[ 5]; + w[ 6] = pws[gid].i[ 6]; + w[ 7] = pws[gid].i[ 7]; + w[ 8] = pws[gid].i[ 8]; + w[ 9] = pws[gid].i[ 9]; + w[10] = pws[gid].i[10]; + w[11] = pws[gid].i[11]; + w[12] = pws[gid].i[12]; + w[13] = pws[gid].i[13]; + w[14] = pws[gid].i[14]; + w[15] = pws[gid].i[15]; + + const u32 pw_len = pws[gid].pw_len & 63; + + /** + * main + */ + + m00600s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, combs_mode, gid_max); } diff --git a/OpenCL/m00600_a3-pure.cl b/OpenCL/m00600_a3-pure.cl new file mode 100644 index 000000000..6f19658b5 --- /dev/null +++ b/OpenCL/m00600_a3-pure.cl @@ -0,0 +1,131 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#ifdef KERNEL_STATIC +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_blake2b.cl" +#endif + +KERNEL_FQ void m00600_mxx (KERN_ATTR_VECTOR ()) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + u32x w[64] = { 0 }; + + for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + blake2b_ctx_vector_t ctx; + + blake2b_init_vector (&ctx); + blake2b_update_vector (&ctx, w, pw_len); + blake2b_final_vector (&ctx); + + const u32x r0 = h32_from_64 (ctx.h[0]); + const u32x r1 = l32_from_64 (ctx.h[0]); + const u32x r2 = h32_from_64 (ctx.h[1]); + const u32x r3 = l32_from_64 (ctx.h[1]); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +KERNEL_FQ void m00600_sxx (KERN_ATTR_VECTOR ()) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + u32x w[64] = { 0 }; + + for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + blake2b_ctx_vector_t ctx; + + blake2b_init_vector (&ctx); + blake2b_update_vector (&ctx, w, pw_len); + blake2b_final_vector (&ctx); + + const u32x r0 = h32_from_64 (ctx.h[0]); + const u32x r1 = l32_from_64 (ctx.h[0]); + const u32x r2 = h32_from_64 (ctx.h[1]); + const u32x r3 = l32_from_64 (ctx.h[1]); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/docs/changes.txt b/docs/changes.txt index 420c2102d..3d9745087 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -1,5 +1,11 @@ * changes v6.0.0 -> v6.0.x +## +## Algorithms +## + +- Added pure kernels for hash-mode 600 (BLAKE2b-512) + ## ## Improvements ## diff --git a/src/modules/module_00600.c b/src/modules/module_00600.c index 3a2b13610..32e5a6550 100644 --- a/src/modules/module_00600.c +++ b/src/modules/module_00600.c @@ -42,31 +42,12 @@ u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } -typedef struct blake2 -{ - u64 h[8]; - u64 t[2]; - u64 f[2]; - u32 buflen; - u32 outlen; - -} blake2_t; - static const char *SIGNATURE_BLAKE2B = "$BLAKE2$"; -u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u64 esalt_size = (const u64) sizeof (blake2_t); - - return esalt_size; -} - int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) { u64 *digest = (u64 *) digest_buf; - blake2_t *blake2 = (blake2_t *) esalt_buf; - token_t token; token.token_cnt = 2; @@ -97,24 +78,6 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE digest[6] = hex_to_u64 (hash_pos + 96); digest[7] = hex_to_u64 (hash_pos + 112); - // Initialize BLAKE2 Params and State - - memset (blake2, 0, sizeof (blake2_t)); - - blake2->h[0] = BLAKE2B_IV_00; - blake2->h[1] = BLAKE2B_IV_01; - blake2->h[2] = BLAKE2B_IV_02; - blake2->h[3] = BLAKE2B_IV_03; - blake2->h[4] = BLAKE2B_IV_04; - blake2->h[5] = BLAKE2B_IV_05; - blake2->h[6] = BLAKE2B_IV_06; - blake2->h[7] = BLAKE2B_IV_07; - - // blake2->h[0] ^= 0x0000000001010040; // digest_lenght = 0x40, depth = 0x01, fanout = 0x01 - blake2->h[0] ^= 0x40 << 0; - blake2->h[0] ^= 0x01 << 16; - blake2->h[0] ^= 0x01 << 24; - return (PARSER_OK); } @@ -161,7 +124,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_dgst_pos3 = module_dgst_pos3; module_ctx->module_dgst_size = module_dgst_size; module_ctx->module_dictstat_disable = MODULE_DEFAULT; - module_ctx->module_esalt_size = module_esalt_size; + module_ctx->module_esalt_size = MODULE_DEFAULT; module_ctx->module_extra_buffer_size = MODULE_DEFAULT; module_ctx->module_extra_tmp_size = MODULE_DEFAULT; module_ctx->module_forced_outfile_format = MODULE_DEFAULT; diff --git a/tools/test_modules/m00600.pm b/tools/test_modules/m00600.pm index a96be2372..8d1883470 100644 --- a/tools/test_modules/m00600.pm +++ b/tools/test_modules/m00600.pm @@ -10,7 +10,7 @@ use warnings; use Digest::BLAKE2 qw (blake2b_hex); -sub module_constraints { [[-1, -1], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } +sub module_constraints { [[0, 256], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } sub module_generate_hash { From 71a9eb2276827647f2132c334c5e825ab47b6d26 Mon Sep 17 00:00:00 2001 From: philsmd Date: Wed, 24 Jun 2020 23:57:00 +0200 Subject: [PATCH 4/8] remove extra spaces in -m 20900 --- OpenCL/m20900_a0-optimized.cl | 8 ++++---- OpenCL/m20900_a1-optimized.cl | 8 ++++---- OpenCL/m20900_a3-optimized.cl | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/OpenCL/m20900_a0-optimized.cl b/OpenCL/m20900_a0-optimized.cl index 1b9c24e0c..9699c3211 100644 --- a/OpenCL/m20900_a0-optimized.cl +++ b/OpenCL/m20900_a0-optimized.cl @@ -421,7 +421,7 @@ KERNEL_FQ void m20900_m04 (KERN_ATTR_RULES ()) _w2[0] = 0; _w2[1] = 0; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -593,7 +593,7 @@ KERNEL_FQ void m20900_m04 (KERN_ATTR_RULES ()) _w2[1] = uint_to_hex_lower8 ((e0 >> 8) & 255) << 0 | uint_to_hex_lower8 ((e0 >> 0) & 255) << 16; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -1143,7 +1143,7 @@ KERNEL_FQ void m20900_s04 (KERN_ATTR_RULES ()) _w2[0] = 0; _w2[1] = 0; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -1315,7 +1315,7 @@ KERNEL_FQ void m20900_s04 (KERN_ATTR_RULES ()) _w2[1] = uint_to_hex_lower8 ((e0 >> 8) & 255) << 0 | uint_to_hex_lower8 ((e0 >> 0) & 255) << 16; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; diff --git a/OpenCL/m20900_a1-optimized.cl b/OpenCL/m20900_a1-optimized.cl index 89b25c3b0..d74c1c050 100644 --- a/OpenCL/m20900_a1-optimized.cl +++ b/OpenCL/m20900_a1-optimized.cl @@ -477,7 +477,7 @@ KERNEL_FQ void m20900_m04 (KERN_ATTR_BASIC ()) _w2[0] = 0; _w2[1] = 0; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -649,7 +649,7 @@ KERNEL_FQ void m20900_m04 (KERN_ATTR_BASIC ()) _w2[1] = uint_to_hex_lower8 ((e0 >> 8) & 255) << 0 | uint_to_hex_lower8 ((e0 >> 0) & 255) << 16; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -1257,7 +1257,7 @@ KERNEL_FQ void m20900_s04 (KERN_ATTR_BASIC ()) _w2[0] = 0; _w2[1] = 0; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -1429,7 +1429,7 @@ KERNEL_FQ void m20900_s04 (KERN_ATTR_BASIC ()) _w2[1] = uint_to_hex_lower8 ((e0 >> 8) & 255) << 0 | uint_to_hex_lower8 ((e0 >> 0) & 255) << 16; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; diff --git a/OpenCL/m20900_a3-optimized.cl b/OpenCL/m20900_a3-optimized.cl index ec8b4c3ab..88d705bb9 100644 --- a/OpenCL/m20900_a3-optimized.cl +++ b/OpenCL/m20900_a3-optimized.cl @@ -378,7 +378,7 @@ DECLSPEC void m20900m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KER _w2[0] = 0; _w2[1] = 0; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -550,7 +550,7 @@ DECLSPEC void m20900m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KER _w2[1] = uint_to_hex_lower8 ((e0 >> 8) & 255) << 0 | uint_to_hex_lower8 ((e0 >> 0) & 255) << 16; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -1051,7 +1051,7 @@ DECLSPEC void m20900s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KER _w2[0] = 0; _w2[1] = 0; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; @@ -1223,7 +1223,7 @@ DECLSPEC void m20900s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, KER _w2[1] = uint_to_hex_lower8 ((e0 >> 8) & 255) << 0 | uint_to_hex_lower8 ((e0 >> 0) & 255) << 16; - _w2[2] = 0; + _w2[2] = 0; _w2[3] = 0; _w3[0] = 0; _w3[1] = 0; From 2f34ad7943ff6c029c8a9a7d87cb8838f4c6a081 Mon Sep 17 00:00:00 2001 From: philsmd Date: Thu, 25 Jun 2020 17:05:32 +0200 Subject: [PATCH 5/8] remove extra newline in blake2b include file --- OpenCL/inc_hash_blake2b.h | 1 - 1 file changed, 1 deletion(-) diff --git a/OpenCL/inc_hash_blake2b.h b/OpenCL/inc_hash_blake2b.h index 798b651b7..702027ce1 100644 --- a/OpenCL/inc_hash_blake2b.h +++ b/OpenCL/inc_hash_blake2b.h @@ -86,5 +86,4 @@ DECLSPEC void blake2b_init_vector (blake2b_ctx_vector_t *ctx); DECLSPEC void blake2b_update_vector (blake2b_ctx_vector_t *ctx, const u32x *w, const u32 len); DECLSPEC void blake2b_final_vector (blake2b_ctx_vector_t *ctx); - #endif // _INC_HASH_BLAKE2B_H From 480466a9545f9802a129f3845c199416cb8b9b78 Mon Sep 17 00:00:00 2001 From: philsmd Date: Sat, 27 Jun 2020 12:40:40 +0200 Subject: [PATCH 6/8] kerberos: do NOT use PWDUMP format with -m 7500/18200 --- src/modules/module_07500.c | 4 +--- src/modules/module_18200.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/modules/module_07500.c b/src/modules/module_07500.c index 468380134..cafc72608 100644 --- a/src/modules/module_07500.c +++ b/src/modules/module_07500.c @@ -22,7 +22,6 @@ static const u64 KERN_TYPE = 7500; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_NOT_ITERATED; static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE; -static const u32 PWDUMP_COLUMN = PWDUMP_COLUMN_NTLM_HASH; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const char *ST_PASS = "hashcat"; static const char *ST_HASH = "$krb5pa$23$user$realm$salt$5cbb0c882a2b26956e81644edbdb746326f4f5f0e947144fb3095dffe4b4b03e854fc1d631323632303636373330383333353630"; @@ -38,7 +37,6 @@ const char *module_hash_name (MAYBE_UNUSED const hashconfig_t *hashconfig, u64 module_kern_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE; } u32 module_opti_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE; } u64 module_opts_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE; } -u32 module_pwdump_column (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return PWDUMP_COLUMN; } u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE; } const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } @@ -305,7 +303,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_potfile_custom_check = MODULE_DEFAULT; module_ctx->module_potfile_disable = MODULE_DEFAULT; module_ctx->module_potfile_keep_all_hashes = MODULE_DEFAULT; - module_ctx->module_pwdump_column = module_pwdump_column; + module_ctx->module_pwdump_column = MODULE_DEFAULT; module_ctx->module_pw_max = MODULE_DEFAULT; module_ctx->module_pw_min = MODULE_DEFAULT; module_ctx->module_salt_max = MODULE_DEFAULT; diff --git a/src/modules/module_18200.c b/src/modules/module_18200.c index 3db491d68..de48f8034 100644 --- a/src/modules/module_18200.c +++ b/src/modules/module_18200.c @@ -22,7 +22,6 @@ static const u64 KERN_TYPE = 18200; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_NOT_ITERATED; static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE; -static const u32 PWDUMP_COLUMN = PWDUMP_COLUMN_NTLM_HASH; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const char *ST_PASS = "hashcat"; static const char *ST_HASH = "$krb5asrep$23$user@domain.com:3e156ada591263b8aab0965f5aebd837$007497cb51b6c8116d6407a782ea0e1c5402b17db7afa6b05a6d30ed164a9933c754d720e279c6c573679bd27128fe77e5fea1f72334c1193c8ff0b370fadc6368bf2d49bbfdba4c5dccab95e8c8ebfdc75f438a0797dbfb2f8a1a5f4c423f9bfc1fea483342a11bd56a216f4d5158ccc4b224b52894fadfba3957dfe4b6b8f5f9f9fe422811a314768673e0c924340b8ccb84775ce9defaa3baa0910b676ad0036d13032b0dd94e3b13903cc738a7b6d00b0b3c210d1f972a6c7cae9bd3c959acf7565be528fc179118f28c679f6deeee1456f0781eb8154e18e49cb27b64bf74cd7112a0ebae2102ac"; @@ -38,7 +37,6 @@ const char *module_hash_name (MAYBE_UNUSED const hashconfig_t *hashconfig, u64 module_kern_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE; } u32 module_opti_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE; } u64 module_opts_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE; } -u32 module_pwdump_column (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return PWDUMP_COLUMN; } u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE; } const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } @@ -301,7 +299,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_potfile_custom_check = MODULE_DEFAULT; module_ctx->module_potfile_disable = MODULE_DEFAULT; module_ctx->module_potfile_keep_all_hashes = MODULE_DEFAULT; - module_ctx->module_pwdump_column = module_pwdump_column; + module_ctx->module_pwdump_column = MODULE_DEFAULT; module_ctx->module_pw_max = MODULE_DEFAULT; module_ctx->module_pw_min = MODULE_DEFAULT; module_ctx->module_salt_max = MODULE_DEFAULT; From 8c54727a8c780eab017528e63d3e2cfdbfe157a9 Mon Sep 17 00:00:00 2001 From: philsmd Date: Sat, 27 Jun 2020 12:48:36 +0200 Subject: [PATCH 7/8] updates default pw_max length for blake2b --- src/modules/module_00600.c | 16 ++++++++++++++-- tools/test_modules/m00600.pm | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/modules/module_00600.c b/src/modules/module_00600.c index 32e5a6550..c22fbde3a 100644 --- a/src/modules/module_00600.c +++ b/src/modules/module_00600.c @@ -23,7 +23,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE; -static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; +static const u32 SALT_TYPE = SALT_TYPE_NONE; static const char *ST_PASS = "hashcat"; static const char *ST_HASH = "$BLAKE2$296c269e70ac5f0095e6fb47693480f0f7b97ccd0307f5c3bfa4df8f5ca5c9308a0e7108e80a0a9c0ebb715e8b7109b072046c6cd5e155b4cfd2f27216283b1e"; @@ -44,6 +44,18 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_BLAKE2B = "$BLAKE2$"; +u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + // this overrides the reductions of pw_max in case optimized kernel is selected + // IOW, even in optimized kernel mode it support length 64 + + const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); + + const u32 pw_max = (optimized_kernel == true) ? 64 : PW_MAX; + + return pw_max; +} + int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) { u64 *digest = (u64 *) digest_buf; @@ -166,7 +178,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_potfile_disable = MODULE_DEFAULT; module_ctx->module_potfile_keep_all_hashes = MODULE_DEFAULT; module_ctx->module_pwdump_column = MODULE_DEFAULT; - module_ctx->module_pw_max = MODULE_DEFAULT; + module_ctx->module_pw_max = module_pw_max; module_ctx->module_pw_min = MODULE_DEFAULT; module_ctx->module_salt_max = MODULE_DEFAULT; module_ctx->module_salt_min = MODULE_DEFAULT; diff --git a/tools/test_modules/m00600.pm b/tools/test_modules/m00600.pm index 8d1883470..a0553f762 100644 --- a/tools/test_modules/m00600.pm +++ b/tools/test_modules/m00600.pm @@ -10,7 +10,7 @@ use warnings; use Digest::BLAKE2 qw (blake2b_hex); -sub module_constraints { [[0, 256], [-1, -1], [0, 55], [-1, -1], [-1, -1]] } +sub module_constraints { [[0, 256], [-1, -1], [0, 64], [-1, -1], [-1, -1]] } sub module_generate_hash { From 9ce625464e64b6b0c18299e4a5ecc495198c1af4 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 30 Jun 2020 13:28:23 +0200 Subject: [PATCH 8/8] Fixed uninitialized value in bitsliced DES kernel (BF mode only) leading to false negatives --- OpenCL/m14000_a3-pure.cl | 515 ++++++++++++++++++++------------------- docs/changes.txt | 6 + 2 files changed, 264 insertions(+), 257 deletions(-) diff --git a/OpenCL/m14000_a3-pure.cl b/OpenCL/m14000_a3-pure.cl index 50bf08343..72075cdc3 100644 --- a/OpenCL/m14000_a3-pure.cl +++ b/OpenCL/m14000_a3-pure.cl @@ -1772,135 +1772,71 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_BITSLICE ()) // salt1 first, because this is a 64 bit value actually - #define d00 (((salt1 >> 0) & 1) ? -1 : 0) - #define d01 (((salt1 >> 1) & 1) ? -1 : 0) - #define d02 (((salt1 >> 2) & 1) ? -1 : 0) - #define d03 (((salt1 >> 3) & 1) ? -1 : 0) - #define d04 (((salt1 >> 4) & 1) ? -1 : 0) - #define d05 (((salt1 >> 5) & 1) ? -1 : 0) - #define d06 (((salt1 >> 6) & 1) ? -1 : 0) - #define d07 (((salt1 >> 7) & 1) ? -1 : 0) - #define d08 (((salt1 >> 8) & 1) ? -1 : 0) - #define d09 (((salt1 >> 9) & 1) ? -1 : 0) - #define d10 (((salt1 >> 10) & 1) ? -1 : 0) - #define d11 (((salt1 >> 11) & 1) ? -1 : 0) - #define d12 (((salt1 >> 12) & 1) ? -1 : 0) - #define d13 (((salt1 >> 13) & 1) ? -1 : 0) - #define d14 (((salt1 >> 14) & 1) ? -1 : 0) - #define d15 (((salt1 >> 15) & 1) ? -1 : 0) - #define d16 (((salt1 >> 16) & 1) ? -1 : 0) - #define d17 (((salt1 >> 17) & 1) ? -1 : 0) - #define d18 (((salt1 >> 18) & 1) ? -1 : 0) - #define d19 (((salt1 >> 19) & 1) ? -1 : 0) - #define d20 (((salt1 >> 20) & 1) ? -1 : 0) - #define d21 (((salt1 >> 21) & 1) ? -1 : 0) - #define d22 (((salt1 >> 22) & 1) ? -1 : 0) - #define d23 (((salt1 >> 23) & 1) ? -1 : 0) - #define d24 (((salt1 >> 24) & 1) ? -1 : 0) - #define d25 (((salt1 >> 25) & 1) ? -1 : 0) - #define d26 (((salt1 >> 26) & 1) ? -1 : 0) - #define d27 (((salt1 >> 27) & 1) ? -1 : 0) - #define d28 (((salt1 >> 28) & 1) ? -1 : 0) - #define d29 (((salt1 >> 29) & 1) ? -1 : 0) - #define d30 (((salt1 >> 30) & 1) ? -1 : 0) - #define d31 (((salt1 >> 31) & 1) ? -1 : 0) - #define d32 (((salt0 >> 0) & 1) ? -1 : 0) - #define d33 (((salt0 >> 1) & 1) ? -1 : 0) - #define d34 (((salt0 >> 2) & 1) ? -1 : 0) - #define d35 (((salt0 >> 3) & 1) ? -1 : 0) - #define d36 (((salt0 >> 4) & 1) ? -1 : 0) - #define d37 (((salt0 >> 5) & 1) ? -1 : 0) - #define d38 (((salt0 >> 6) & 1) ? -1 : 0) - #define d39 (((salt0 >> 7) & 1) ? -1 : 0) - #define d40 (((salt0 >> 8) & 1) ? -1 : 0) - #define d41 (((salt0 >> 9) & 1) ? -1 : 0) - #define d42 (((salt0 >> 10) & 1) ? -1 : 0) - #define d43 (((salt0 >> 11) & 1) ? -1 : 0) - #define d44 (((salt0 >> 12) & 1) ? -1 : 0) - #define d45 (((salt0 >> 13) & 1) ? -1 : 0) - #define d46 (((salt0 >> 14) & 1) ? -1 : 0) - #define d47 (((salt0 >> 15) & 1) ? -1 : 0) - #define d48 (((salt0 >> 16) & 1) ? -1 : 0) - #define d49 (((salt0 >> 17) & 1) ? -1 : 0) - #define d50 (((salt0 >> 18) & 1) ? -1 : 0) - #define d51 (((salt0 >> 19) & 1) ? -1 : 0) - #define d52 (((salt0 >> 20) & 1) ? -1 : 0) - #define d53 (((salt0 >> 21) & 1) ? -1 : 0) - #define d54 (((salt0 >> 22) & 1) ? -1 : 0) - #define d55 (((salt0 >> 23) & 1) ? -1 : 0) - #define d56 (((salt0 >> 24) & 1) ? -1 : 0) - #define d57 (((salt0 >> 25) & 1) ? -1 : 0) - #define d58 (((salt0 >> 26) & 1) ? -1 : 0) - #define d59 (((salt0 >> 27) & 1) ? -1 : 0) - #define d60 (((salt0 >> 28) & 1) ? -1 : 0) - #define d61 (((salt0 >> 29) & 1) ? -1 : 0) - #define d62 (((salt0 >> 30) & 1) ? -1 : 0) - #define d63 (((salt0 >> 31) & 1) ? -1 : 0) - u32 D00 = d00; - u32 D01 = d01; - u32 D02 = d02; - u32 D03 = d03; - u32 D04 = d04; - u32 D05 = d05; - u32 D06 = d06; - u32 D07 = d07; - u32 D08 = d08; - u32 D09 = d09; - u32 D10 = d10; - u32 D11 = d11; - u32 D12 = d12; - u32 D13 = d13; - u32 D14 = d14; - u32 D15 = d15; - u32 D16 = d16; - u32 D17 = d17; - u32 D18 = d18; - u32 D19 = d19; - u32 D20 = d20; - u32 D21 = d21; - u32 D22 = d22; - u32 D23 = d23; - u32 D24 = d24; - u32 D25 = d25; - u32 D26 = d26; - u32 D27 = d27; - u32 D28 = d28; - u32 D29 = d29; - u32 D30 = d30; - u32 D31 = d31; - u32 D32 = d32; - u32 D33 = d33; - u32 D34 = d34; - u32 D35 = d35; - u32 D36 = d36; - u32 D37 = d37; - u32 D38 = d38; - u32 D39 = d39; - u32 D40 = d40; - u32 D41 = d41; - u32 D42 = d42; - u32 D43 = d43; - u32 D44 = d44; - u32 D45 = d45; - u32 D46 = d46; - u32 D47 = d47; - u32 D48 = d48; - u32 D49 = d49; - u32 D50 = d50; - u32 D51 = d51; - u32 D52 = d52; - u32 D53 = d53; - u32 D54 = d54; - u32 D55 = d55; - u32 D56 = d56; - u32 D57 = d57; - u32 D58 = d58; - u32 D59 = d59; - u32 D60 = d60; - u32 D61 = d61; - u32 D62 = d62; - u32 D63 = d63; + const u32 d00 = (((salt1 >> 0) & 1) ? -1 : 0); + const u32 d01 = (((salt1 >> 1) & 1) ? -1 : 0); + const u32 d02 = (((salt1 >> 2) & 1) ? -1 : 0); + const u32 d03 = (((salt1 >> 3) & 1) ? -1 : 0); + const u32 d04 = (((salt1 >> 4) & 1) ? -1 : 0); + const u32 d05 = (((salt1 >> 5) & 1) ? -1 : 0); + const u32 d06 = (((salt1 >> 6) & 1) ? -1 : 0); + const u32 d07 = (((salt1 >> 7) & 1) ? -1 : 0); + const u32 d08 = (((salt1 >> 8) & 1) ? -1 : 0); + const u32 d09 = (((salt1 >> 9) & 1) ? -1 : 0); + const u32 d10 = (((salt1 >> 10) & 1) ? -1 : 0); + const u32 d11 = (((salt1 >> 11) & 1) ? -1 : 0); + const u32 d12 = (((salt1 >> 12) & 1) ? -1 : 0); + const u32 d13 = (((salt1 >> 13) & 1) ? -1 : 0); + const u32 d14 = (((salt1 >> 14) & 1) ? -1 : 0); + const u32 d15 = (((salt1 >> 15) & 1) ? -1 : 0); + const u32 d16 = (((salt1 >> 16) & 1) ? -1 : 0); + const u32 d17 = (((salt1 >> 17) & 1) ? -1 : 0); + const u32 d18 = (((salt1 >> 18) & 1) ? -1 : 0); + const u32 d19 = (((salt1 >> 19) & 1) ? -1 : 0); + const u32 d20 = (((salt1 >> 20) & 1) ? -1 : 0); + const u32 d21 = (((salt1 >> 21) & 1) ? -1 : 0); + const u32 d22 = (((salt1 >> 22) & 1) ? -1 : 0); + const u32 d23 = (((salt1 >> 23) & 1) ? -1 : 0); + const u32 d24 = (((salt1 >> 24) & 1) ? -1 : 0); + const u32 d25 = (((salt1 >> 25) & 1) ? -1 : 0); + const u32 d26 = (((salt1 >> 26) & 1) ? -1 : 0); + const u32 d27 = (((salt1 >> 27) & 1) ? -1 : 0); + const u32 d28 = (((salt1 >> 28) & 1) ? -1 : 0); + const u32 d29 = (((salt1 >> 29) & 1) ? -1 : 0); + const u32 d30 = (((salt1 >> 30) & 1) ? -1 : 0); + const u32 d31 = (((salt1 >> 31) & 1) ? -1 : 0); + const u32 d32 = (((salt0 >> 0) & 1) ? -1 : 0); + const u32 d33 = (((salt0 >> 1) & 1) ? -1 : 0); + const u32 d34 = (((salt0 >> 2) & 1) ? -1 : 0); + const u32 d35 = (((salt0 >> 3) & 1) ? -1 : 0); + const u32 d36 = (((salt0 >> 4) & 1) ? -1 : 0); + const u32 d37 = (((salt0 >> 5) & 1) ? -1 : 0); + const u32 d38 = (((salt0 >> 6) & 1) ? -1 : 0); + const u32 d39 = (((salt0 >> 7) & 1) ? -1 : 0); + const u32 d40 = (((salt0 >> 8) & 1) ? -1 : 0); + const u32 d41 = (((salt0 >> 9) & 1) ? -1 : 0); + const u32 d42 = (((salt0 >> 10) & 1) ? -1 : 0); + const u32 d43 = (((salt0 >> 11) & 1) ? -1 : 0); + const u32 d44 = (((salt0 >> 12) & 1) ? -1 : 0); + const u32 d45 = (((salt0 >> 13) & 1) ? -1 : 0); + const u32 d46 = (((salt0 >> 14) & 1) ? -1 : 0); + const u32 d47 = (((salt0 >> 15) & 1) ? -1 : 0); + const u32 d48 = (((salt0 >> 16) & 1) ? -1 : 0); + const u32 d49 = (((salt0 >> 17) & 1) ? -1 : 0); + const u32 d50 = (((salt0 >> 18) & 1) ? -1 : 0); + const u32 d51 = (((salt0 >> 19) & 1) ? -1 : 0); + const u32 d52 = (((salt0 >> 20) & 1) ? -1 : 0); + const u32 d53 = (((salt0 >> 21) & 1) ? -1 : 0); + const u32 d54 = (((salt0 >> 22) & 1) ? -1 : 0); + const u32 d55 = (((salt0 >> 23) & 1) ? -1 : 0); + const u32 d56 = (((salt0 >> 24) & 1) ? -1 : 0); + const u32 d57 = (((salt0 >> 25) & 1) ? -1 : 0); + const u32 d58 = (((salt0 >> 26) & 1) ? -1 : 0); + const u32 d59 = (((salt0 >> 27) & 1) ? -1 : 0); + const u32 d60 = (((salt0 >> 28) & 1) ? -1 : 0); + const u32 d61 = (((salt0 >> 29) & 1) ? -1 : 0); + const u32 d62 = (((salt0 >> 30) & 1) ? -1 : 0); + const u32 d63 = (((salt0 >> 31) & 1) ? -1 : 0); /** * base @@ -2032,6 +1968,71 @@ KERNEL_FQ void m14000_mxx (KERN_ATTR_BITSLICE ()) k26 |= words_buf_s[pc_pos].b[26]; k27 |= words_buf_s[pc_pos].b[27]; + u32 D00 = d00; + u32 D01 = d01; + u32 D02 = d02; + u32 D03 = d03; + u32 D04 = d04; + u32 D05 = d05; + u32 D06 = d06; + u32 D07 = d07; + u32 D08 = d08; + u32 D09 = d09; + u32 D10 = d10; + u32 D11 = d11; + u32 D12 = d12; + u32 D13 = d13; + u32 D14 = d14; + u32 D15 = d15; + u32 D16 = d16; + u32 D17 = d17; + u32 D18 = d18; + u32 D19 = d19; + u32 D20 = d20; + u32 D21 = d21; + u32 D22 = d22; + u32 D23 = d23; + u32 D24 = d24; + u32 D25 = d25; + u32 D26 = d26; + u32 D27 = d27; + u32 D28 = d28; + u32 D29 = d29; + u32 D30 = d30; + u32 D31 = d31; + u32 D32 = d32; + u32 D33 = d33; + u32 D34 = d34; + u32 D35 = d35; + u32 D36 = d36; + u32 D37 = d37; + u32 D38 = d38; + u32 D39 = d39; + u32 D40 = d40; + u32 D41 = d41; + u32 D42 = d42; + u32 D43 = d43; + u32 D44 = d44; + u32 D45 = d45; + u32 D46 = d46; + u32 D47 = d47; + u32 D48 = d48; + u32 D49 = d49; + u32 D50 = d50; + u32 D51 = d51; + u32 D52 = d52; + u32 D53 = d53; + u32 D54 = d54; + u32 D55 = d55; + u32 D56 = d56; + u32 D57 = d57; + u32 D58 = d58; + u32 D59 = d59; + u32 D60 = d60; + u32 D61 = d61; + u32 D62 = d62; + u32 D63 = d63; + DES ( k00, k01, k02, k03, k04, k05, k06, @@ -2213,135 +2214,70 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_BITSLICE ()) // salt1 first, because this is a 64 bit value actually - #define d00 (((salt1 >> 0) & 1) ? -1 : 0) - #define d01 (((salt1 >> 1) & 1) ? -1 : 0) - #define d02 (((salt1 >> 2) & 1) ? -1 : 0) - #define d03 (((salt1 >> 3) & 1) ? -1 : 0) - #define d04 (((salt1 >> 4) & 1) ? -1 : 0) - #define d05 (((salt1 >> 5) & 1) ? -1 : 0) - #define d06 (((salt1 >> 6) & 1) ? -1 : 0) - #define d07 (((salt1 >> 7) & 1) ? -1 : 0) - #define d08 (((salt1 >> 8) & 1) ? -1 : 0) - #define d09 (((salt1 >> 9) & 1) ? -1 : 0) - #define d10 (((salt1 >> 10) & 1) ? -1 : 0) - #define d11 (((salt1 >> 11) & 1) ? -1 : 0) - #define d12 (((salt1 >> 12) & 1) ? -1 : 0) - #define d13 (((salt1 >> 13) & 1) ? -1 : 0) - #define d14 (((salt1 >> 14) & 1) ? -1 : 0) - #define d15 (((salt1 >> 15) & 1) ? -1 : 0) - #define d16 (((salt1 >> 16) & 1) ? -1 : 0) - #define d17 (((salt1 >> 17) & 1) ? -1 : 0) - #define d18 (((salt1 >> 18) & 1) ? -1 : 0) - #define d19 (((salt1 >> 19) & 1) ? -1 : 0) - #define d20 (((salt1 >> 20) & 1) ? -1 : 0) - #define d21 (((salt1 >> 21) & 1) ? -1 : 0) - #define d22 (((salt1 >> 22) & 1) ? -1 : 0) - #define d23 (((salt1 >> 23) & 1) ? -1 : 0) - #define d24 (((salt1 >> 24) & 1) ? -1 : 0) - #define d25 (((salt1 >> 25) & 1) ? -1 : 0) - #define d26 (((salt1 >> 26) & 1) ? -1 : 0) - #define d27 (((salt1 >> 27) & 1) ? -1 : 0) - #define d28 (((salt1 >> 28) & 1) ? -1 : 0) - #define d29 (((salt1 >> 29) & 1) ? -1 : 0) - #define d30 (((salt1 >> 30) & 1) ? -1 : 0) - #define d31 (((salt1 >> 31) & 1) ? -1 : 0) - #define d32 (((salt0 >> 0) & 1) ? -1 : 0) - #define d33 (((salt0 >> 1) & 1) ? -1 : 0) - #define d34 (((salt0 >> 2) & 1) ? -1 : 0) - #define d35 (((salt0 >> 3) & 1) ? -1 : 0) - #define d36 (((salt0 >> 4) & 1) ? -1 : 0) - #define d37 (((salt0 >> 5) & 1) ? -1 : 0) - #define d38 (((salt0 >> 6) & 1) ? -1 : 0) - #define d39 (((salt0 >> 7) & 1) ? -1 : 0) - #define d40 (((salt0 >> 8) & 1) ? -1 : 0) - #define d41 (((salt0 >> 9) & 1) ? -1 : 0) - #define d42 (((salt0 >> 10) & 1) ? -1 : 0) - #define d43 (((salt0 >> 11) & 1) ? -1 : 0) - #define d44 (((salt0 >> 12) & 1) ? -1 : 0) - #define d45 (((salt0 >> 13) & 1) ? -1 : 0) - #define d46 (((salt0 >> 14) & 1) ? -1 : 0) - #define d47 (((salt0 >> 15) & 1) ? -1 : 0) - #define d48 (((salt0 >> 16) & 1) ? -1 : 0) - #define d49 (((salt0 >> 17) & 1) ? -1 : 0) - #define d50 (((salt0 >> 18) & 1) ? -1 : 0) - #define d51 (((salt0 >> 19) & 1) ? -1 : 0) - #define d52 (((salt0 >> 20) & 1) ? -1 : 0) - #define d53 (((salt0 >> 21) & 1) ? -1 : 0) - #define d54 (((salt0 >> 22) & 1) ? -1 : 0) - #define d55 (((salt0 >> 23) & 1) ? -1 : 0) - #define d56 (((salt0 >> 24) & 1) ? -1 : 0) - #define d57 (((salt0 >> 25) & 1) ? -1 : 0) - #define d58 (((salt0 >> 26) & 1) ? -1 : 0) - #define d59 (((salt0 >> 27) & 1) ? -1 : 0) - #define d60 (((salt0 >> 28) & 1) ? -1 : 0) - #define d61 (((salt0 >> 29) & 1) ? -1 : 0) - #define d62 (((salt0 >> 30) & 1) ? -1 : 0) - #define d63 (((salt0 >> 31) & 1) ? -1 : 0) - - u32 D00 = d00; - u32 D01 = d01; - u32 D02 = d02; - u32 D03 = d03; - u32 D04 = d04; - u32 D05 = d05; - u32 D06 = d06; - u32 D07 = d07; - u32 D08 = d08; - u32 D09 = d09; - u32 D10 = d10; - u32 D11 = d11; - u32 D12 = d12; - u32 D13 = d13; - u32 D14 = d14; - u32 D15 = d15; - u32 D16 = d16; - u32 D17 = d17; - u32 D18 = d18; - u32 D19 = d19; - u32 D20 = d20; - u32 D21 = d21; - u32 D22 = d22; - u32 D23 = d23; - u32 D24 = d24; - u32 D25 = d25; - u32 D26 = d26; - u32 D27 = d27; - u32 D28 = d28; - u32 D29 = d29; - u32 D30 = d30; - u32 D31 = d31; - u32 D32 = d32; - u32 D33 = d33; - u32 D34 = d34; - u32 D35 = d35; - u32 D36 = d36; - u32 D37 = d37; - u32 D38 = d38; - u32 D39 = d39; - u32 D40 = d40; - u32 D41 = d41; - u32 D42 = d42; - u32 D43 = d43; - u32 D44 = d44; - u32 D45 = d45; - u32 D46 = d46; - u32 D47 = d47; - u32 D48 = d48; - u32 D49 = d49; - u32 D50 = d50; - u32 D51 = d51; - u32 D52 = d52; - u32 D53 = d53; - u32 D54 = d54; - u32 D55 = d55; - u32 D56 = d56; - u32 D57 = d57; - u32 D58 = d58; - u32 D59 = d59; - u32 D60 = d60; - u32 D61 = d61; - u32 D62 = d62; - u32 D63 = d63; + const u32 d00 = (((salt1 >> 0) & 1) ? -1 : 0); + const u32 d01 = (((salt1 >> 1) & 1) ? -1 : 0); + const u32 d02 = (((salt1 >> 2) & 1) ? -1 : 0); + const u32 d03 = (((salt1 >> 3) & 1) ? -1 : 0); + const u32 d04 = (((salt1 >> 4) & 1) ? -1 : 0); + const u32 d05 = (((salt1 >> 5) & 1) ? -1 : 0); + const u32 d06 = (((salt1 >> 6) & 1) ? -1 : 0); + const u32 d07 = (((salt1 >> 7) & 1) ? -1 : 0); + const u32 d08 = (((salt1 >> 8) & 1) ? -1 : 0); + const u32 d09 = (((salt1 >> 9) & 1) ? -1 : 0); + const u32 d10 = (((salt1 >> 10) & 1) ? -1 : 0); + const u32 d11 = (((salt1 >> 11) & 1) ? -1 : 0); + const u32 d12 = (((salt1 >> 12) & 1) ? -1 : 0); + const u32 d13 = (((salt1 >> 13) & 1) ? -1 : 0); + const u32 d14 = (((salt1 >> 14) & 1) ? -1 : 0); + const u32 d15 = (((salt1 >> 15) & 1) ? -1 : 0); + const u32 d16 = (((salt1 >> 16) & 1) ? -1 : 0); + const u32 d17 = (((salt1 >> 17) & 1) ? -1 : 0); + const u32 d18 = (((salt1 >> 18) & 1) ? -1 : 0); + const u32 d19 = (((salt1 >> 19) & 1) ? -1 : 0); + const u32 d20 = (((salt1 >> 20) & 1) ? -1 : 0); + const u32 d21 = (((salt1 >> 21) & 1) ? -1 : 0); + const u32 d22 = (((salt1 >> 22) & 1) ? -1 : 0); + const u32 d23 = (((salt1 >> 23) & 1) ? -1 : 0); + const u32 d24 = (((salt1 >> 24) & 1) ? -1 : 0); + const u32 d25 = (((salt1 >> 25) & 1) ? -1 : 0); + const u32 d26 = (((salt1 >> 26) & 1) ? -1 : 0); + const u32 d27 = (((salt1 >> 27) & 1) ? -1 : 0); + const u32 d28 = (((salt1 >> 28) & 1) ? -1 : 0); + const u32 d29 = (((salt1 >> 29) & 1) ? -1 : 0); + const u32 d30 = (((salt1 >> 30) & 1) ? -1 : 0); + const u32 d31 = (((salt1 >> 31) & 1) ? -1 : 0); + const u32 d32 = (((salt0 >> 0) & 1) ? -1 : 0); + const u32 d33 = (((salt0 >> 1) & 1) ? -1 : 0); + const u32 d34 = (((salt0 >> 2) & 1) ? -1 : 0); + const u32 d35 = (((salt0 >> 3) & 1) ? -1 : 0); + const u32 d36 = (((salt0 >> 4) & 1) ? -1 : 0); + const u32 d37 = (((salt0 >> 5) & 1) ? -1 : 0); + const u32 d38 = (((salt0 >> 6) & 1) ? -1 : 0); + const u32 d39 = (((salt0 >> 7) & 1) ? -1 : 0); + const u32 d40 = (((salt0 >> 8) & 1) ? -1 : 0); + const u32 d41 = (((salt0 >> 9) & 1) ? -1 : 0); + const u32 d42 = (((salt0 >> 10) & 1) ? -1 : 0); + const u32 d43 = (((salt0 >> 11) & 1) ? -1 : 0); + const u32 d44 = (((salt0 >> 12) & 1) ? -1 : 0); + const u32 d45 = (((salt0 >> 13) & 1) ? -1 : 0); + const u32 d46 = (((salt0 >> 14) & 1) ? -1 : 0); + const u32 d47 = (((salt0 >> 15) & 1) ? -1 : 0); + const u32 d48 = (((salt0 >> 16) & 1) ? -1 : 0); + const u32 d49 = (((salt0 >> 17) & 1) ? -1 : 0); + const u32 d50 = (((salt0 >> 18) & 1) ? -1 : 0); + const u32 d51 = (((salt0 >> 19) & 1) ? -1 : 0); + const u32 d52 = (((salt0 >> 20) & 1) ? -1 : 0); + const u32 d53 = (((salt0 >> 21) & 1) ? -1 : 0); + const u32 d54 = (((salt0 >> 22) & 1) ? -1 : 0); + const u32 d55 = (((salt0 >> 23) & 1) ? -1 : 0); + const u32 d56 = (((salt0 >> 24) & 1) ? -1 : 0); + const u32 d57 = (((salt0 >> 25) & 1) ? -1 : 0); + const u32 d58 = (((salt0 >> 26) & 1) ? -1 : 0); + const u32 d59 = (((salt0 >> 27) & 1) ? -1 : 0); + const u32 d60 = (((salt0 >> 28) & 1) ? -1 : 0); + const u32 d61 = (((salt0 >> 29) & 1) ? -1 : 0); + const u32 d62 = (((salt0 >> 30) & 1) ? -1 : 0); + const u32 d63 = (((salt0 >> 31) & 1) ? -1 : 0); /** * digest @@ -2545,6 +2481,71 @@ KERNEL_FQ void m14000_sxx (KERN_ATTR_BITSLICE ()) k26 |= words_buf_s[pc_pos].b[26]; k27 |= words_buf_s[pc_pos].b[27]; + u32 D00 = d00; + u32 D01 = d01; + u32 D02 = d02; + u32 D03 = d03; + u32 D04 = d04; + u32 D05 = d05; + u32 D06 = d06; + u32 D07 = d07; + u32 D08 = d08; + u32 D09 = d09; + u32 D10 = d10; + u32 D11 = d11; + u32 D12 = d12; + u32 D13 = d13; + u32 D14 = d14; + u32 D15 = d15; + u32 D16 = d16; + u32 D17 = d17; + u32 D18 = d18; + u32 D19 = d19; + u32 D20 = d20; + u32 D21 = d21; + u32 D22 = d22; + u32 D23 = d23; + u32 D24 = d24; + u32 D25 = d25; + u32 D26 = d26; + u32 D27 = d27; + u32 D28 = d28; + u32 D29 = d29; + u32 D30 = d30; + u32 D31 = d31; + u32 D32 = d32; + u32 D33 = d33; + u32 D34 = d34; + u32 D35 = d35; + u32 D36 = d36; + u32 D37 = d37; + u32 D38 = d38; + u32 D39 = d39; + u32 D40 = d40; + u32 D41 = d41; + u32 D42 = d42; + u32 D43 = d43; + u32 D44 = d44; + u32 D45 = d45; + u32 D46 = d46; + u32 D47 = d47; + u32 D48 = d48; + u32 D49 = d49; + u32 D50 = d50; + u32 D51 = d51; + u32 D52 = d52; + u32 D53 = d53; + u32 D54 = d54; + u32 D55 = d55; + u32 D56 = d56; + u32 D57 = d57; + u32 D58 = d58; + u32 D59 = d59; + u32 D60 = d60; + u32 D61 = d61; + u32 D62 = d62; + u32 D63 = d63; + DES ( k00, k01, k02, k03, k04, k05, k06, diff --git a/docs/changes.txt b/docs/changes.txt index 420c2102d..a7b482d3f 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -1,5 +1,11 @@ * changes v6.0.0 -> v6.0.x +## +## Bugs +## + +- Fixed uninitialized value in bitsliced DES kernel (BF mode only) leading to false negatives + ## ## Improvements ##