From 50f39b3563fa429dbec441288bd9a21fc328c21e Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 15 Dec 2015 13:42:37 +0100 Subject: [PATCH] Fix append_* function calls --- OpenCL/common.c | 6396 ++++++++++++++++++++++--------------------- OpenCL/m00010_a0.cl | 4 +- OpenCL/m00010_a1.cl | 4 +- OpenCL/m00020_a0.cl | 4 +- OpenCL/m00020_a1.cl | 4 +- OpenCL/m00030_a0.cl | 4 +- OpenCL/m00030_a1.cl | 4 +- OpenCL/m00040_a0.cl | 4 +- OpenCL/m00040_a1.cl | 4 +- OpenCL/m00060_a0.cl | 4 +- OpenCL/m00060_a1.cl | 4 +- OpenCL/m00060_a3.cl | 4 +- OpenCL/m00100_a0.cl | 4 +- OpenCL/m00100_a1.cl | 8 +- OpenCL/m00110_a0.cl | 4 +- OpenCL/m00110_a1.cl | 4 +- OpenCL/m00120_a0.cl | 4 +- OpenCL/m00120_a1.cl | 4 +- OpenCL/m00130_a0.cl | 4 +- OpenCL/m00130_a1.cl | 4 +- OpenCL/m00140_a0.cl | 4 +- OpenCL/m00140_a1.cl | 4 +- OpenCL/m00160_a0.cl | 4 +- OpenCL/m00160_a1.cl | 4 +- OpenCL/m00190_a0.cl | 4 +- OpenCL/m00190_a1.cl | 8 +- OpenCL/m00300_a0.cl | 4 +- OpenCL/m00300_a1.cl | 8 +- OpenCL/m00900_a0.cl | 4 +- OpenCL/m00900_a1.cl | 4 +- OpenCL/m01000_a0.cl | 4 +- OpenCL/m01000_a1.cl | 4 +- OpenCL/m01100_a0.cl | 4 +- OpenCL/m01100_a1.cl | 4 +- OpenCL/m01400_a0.cl | 4 +- OpenCL/m01400_a1.cl | 8 +- OpenCL/m01410_a0.cl | 4 +- OpenCL/m01410_a1.cl | 4 +- OpenCL/m01420_a0.cl | 4 +- OpenCL/m01420_a1.cl | 4 +- OpenCL/m01430_a0.cl | 4 +- OpenCL/m01430_a1.cl | 4 +- OpenCL/m01440_a0.cl | 4 +- OpenCL/m01440_a1.cl | 4 +- OpenCL/m01460_a0.cl | 4 +- OpenCL/m01460_a1.cl | 4 +- OpenCL/m01600.cl | 6 +- OpenCL/m01700_a0.cl | 4 +- OpenCL/m01700_a1.cl | 8 +- OpenCL/m01710_a0.cl | 4 +- OpenCL/m01710_a1.cl | 4 +- OpenCL/m01720_a0.cl | 4 +- OpenCL/m01720_a1.cl | 4 +- OpenCL/m01730_a0.cl | 4 +- OpenCL/m01730_a1.cl | 4 +- OpenCL/m01740_a0.cl | 4 +- OpenCL/m01740_a1.cl | 4 +- OpenCL/m01760_a0.cl | 4 +- OpenCL/m01760_a1.cl | 4 +- OpenCL/m02100.cl | 8 +- OpenCL/m02500.cl | 6 +- OpenCL/m02610_a0.cl | 4 +- OpenCL/m02610_a1.cl | 4 +- OpenCL/m02710_a0.cl | 4 +- OpenCL/m02710_a1.cl | 4 +- OpenCL/m02810_a0.cl | 4 +- OpenCL/m02810_a1.cl | 4 +- OpenCL/m03710_a0.cl | 4 +- OpenCL/m03710_a1.cl | 4 +- OpenCL/m03800_a0.cl | 4 +- OpenCL/m03800_a1.cl | 4 +- OpenCL/m03800_a3.cl | 4 +- OpenCL/m04310_a0.cl | 4 +- OpenCL/m04310_a1.cl | 4 +- OpenCL/m04400_a0.cl | 4 +- OpenCL/m04400_a1.cl | 8 +- OpenCL/m04500_a0.cl | 4 +- OpenCL/m04500_a1.cl | 8 +- OpenCL/m04700_a0.cl | 4 +- OpenCL/m04700_a1.cl | 8 +- OpenCL/m04900_a0.cl | 4 +- OpenCL/m04900_a1.cl | 4 +- OpenCL/m04900_a3.cl | 4 +- OpenCL/m05000_a0.cl | 4 +- OpenCL/m05000_a1.cl | 8 +- OpenCL/m05100_a0.cl | 4 +- OpenCL/m05100_a1.cl | 4 +- OpenCL/m05200.cl | 2 +- OpenCL/m05500_a0.cl | 4 +- OpenCL/m05500_a1.cl | 4 +- OpenCL/m05600_a0.cl | 4 +- OpenCL/m05600_a1.cl | 4 +- OpenCL/m06000_a0.cl | 4 +- OpenCL/m06000_a1.cl | 4 +- OpenCL/m06100_a0.cl | 4 +- OpenCL/m06100_a1.cl | 4 +- OpenCL/m06300.cl | 6 +- OpenCL/m06400.cl | 4 +- OpenCL/m06500.cl | 4 +- OpenCL/m06600.cl | 4 +- OpenCL/m06700.cl | 4 +- OpenCL/m06800.cl | 4 +- OpenCL/m07400.cl | 10 +- OpenCL/m07500_a0.cl | 2 +- OpenCL/m07500_a1.cl | 2 +- OpenCL/m07500_a3.cl | 2 +- OpenCL/m07600_a0.cl | 4 +- OpenCL/m07600_a1.cl | 8 +- OpenCL/m07900.cl | 4 +- OpenCL/m08100_a0.cl | 4 +- OpenCL/m08100_a1.cl | 4 +- OpenCL/m08100_a3.cl | 12 +- OpenCL/m08400_a0.cl | 4 +- OpenCL/m08400_a1.cl | 8 +- OpenCL/m08800.cl | 6 +- OpenCL/m09000.cl | 2 +- OpenCL/m09400.cl | 2 +- OpenCL/m09500.cl | 2 +- OpenCL/m09600.cl | 2 +- OpenCL/m09700_a0.cl | 4 +- OpenCL/m09700_a1.cl | 4 +- OpenCL/m09720_a0.cl | 4 +- OpenCL/m09720_a1.cl | 4 +- OpenCL/m09800_a0.cl | 4 +- OpenCL/m09800_a1.cl | 4 +- OpenCL/m09820_a0.cl | 4 +- OpenCL/m09820_a1.cl | 4 +- OpenCL/m10300.cl | 2 +- OpenCL/m10420_a1.cl | 4 +- OpenCL/m10700.cl | 2 +- OpenCL/m10800_a0.cl | 4 +- OpenCL/m10800_a1.cl | 8 +- OpenCL/m11000_a0.cl | 4 +- OpenCL/m11000_a1.cl | 4 +- OpenCL/m11200_a0.cl | 4 +- OpenCL/m11200_a1.cl | 8 +- OpenCL/m11400_a0.cl | 4 +- OpenCL/m11400_a1.cl | 4 +- OpenCL/m11600.cl | 2 +- OpenCL/m11700_a0.cl | 4 +- OpenCL/m11700_a1.cl | 8 +- OpenCL/m11800_a0.cl | 4 +- OpenCL/m11800_a1.cl | 8 +- OpenCL/m12200.cl | 2 +- OpenCL/m12600_a0.cl | 4 +- OpenCL/m12600_a1.cl | 8 +- OpenCL/m12800.cl | 2 +- 147 files changed, 3523 insertions(+), 3521 deletions(-) diff --git a/OpenCL/common.c b/OpenCL/common.c index 18e67a46c..df7a2bb0f 100644 --- a/OpenCL/common.c +++ b/OpenCL/common.c @@ -3,7 +3,7 @@ * License.....: MIT */ -static int device_memcmp (const u32 d1[4], __global u32 *d2) +static int hash_comp (const u32 d1[4], __global u32 *d2) { if (d1[3] > d2[DGST_R3]) return ( 1); if (d1[3] < d2[DGST_R3]) return (-1); @@ -25,7 +25,7 @@ static int find_hash (const u32 digest[4], const u32 digests_cnt, __global diges const u32 c = l + m; - const int cmp = device_memcmp (digest, digests_buf[c].digest_buf); + const int cmp = hash_comp (digest, digests_buf[c].digest_buf); if (cmp > 0) { @@ -2757,141 +2757,6 @@ static void append_0x80_2x4 (u32 w0[4], u32 w1[4], const u32 offset) } } -// before: append_0x80_2_be -static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset) -{ - switch (offset) - { - case 0: - w0[0] |= 0x80000000; - break; - - case 1: - w0[0] |= 0x800000; - break; - - case 2: - w0[0] |= 0x8000; - break; - - case 3: - w0[0] |= 0x80; - break; - - case 4: - w0[1] |= 0x80000000; - break; - - case 5: - w0[1] |= 0x800000; - break; - - case 6: - w0[1] |= 0x8000; - break; - - case 7: - w0[1] |= 0x80; - break; - - case 8: - w0[2] |= 0x80000000; - break; - - case 9: - w0[2] |= 0x800000; - break; - - case 10: - w0[2] |= 0x8000; - break; - - case 11: - w0[2] |= 0x80; - break; - - case 12: - w0[3] |= 0x80000000; - break; - - case 13: - w0[3] |= 0x800000; - break; - - case 14: - w0[3] |= 0x8000; - break; - - case 15: - w0[3] |= 0x80; - break; - - case 16: - w1[0] |= 0x80000000; - break; - - case 17: - w1[0] |= 0x800000; - break; - - case 18: - w1[0] |= 0x8000; - break; - - case 19: - w1[0] |= 0x80; - break; - - case 20: - w1[1] |= 0x80000000; - break; - - case 21: - w1[1] |= 0x800000; - break; - - case 22: - w1[1] |= 0x8000; - break; - - case 23: - w1[1] |= 0x80; - break; - - case 24: - w1[2] |= 0x80000000; - break; - - case 25: - w1[2] |= 0x800000; - break; - - case 26: - w1[2] |= 0x8000; - break; - - case 27: - w1[2] |= 0x80; - break; - - case 28: - w1[3] |= 0x80000000; - break; - - case 29: - w1[3] |= 0x800000; - break; - - case 30: - w1[3] |= 0x8000; - break; - - case 31: - w1[3] |= 0x80; - break; - } -} - // before: append_0x80_3 static void append_0x80_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { @@ -3873,4108 +3738,4245 @@ static void append_0x80_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -// before: append_0x80_4 -static void append_0x80_1x16 (u32 w[16], const u32 offset) +// before: device_memcat2L +static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2]) { switch (offset) { - case 0: - w[ 0] = 0x80; - break; - case 1: - w[ 0] = w[ 0] | 0x8000; + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; break; case 2: - w[ 0] = w[ 0] | 0x800000; + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; break; case 3: - w[ 0] = w[ 0] | 0x80000000; + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; break; case 4: - w[ 1] = 0x80; + dst0[1] = src_r0[0]; break; case 5: - w[ 1] = w[ 1] | 0x8000; + dst0[1] = src_l0[1] | src_r0[0] << 8; break; case 6: - w[ 1] = w[ 1] | 0x800000; + dst0[1] = src_l0[1] | src_r0[0] << 16; break; case 7: - w[ 1] = w[ 1] | 0x80000000; - break; - - case 8: - w[ 2] = 0x80; + dst0[1] = src_l0[1] | src_r0[0] << 24; break; + } +} - case 9: - w[ 2] = w[ 2] | 0x8000; +// before: device_memcat4L +static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4]) +{ + switch (offset) + { + case 1: + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; - case 10: - w[ 2] = w[ 2] | 0x800000; + case 2: + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; - case 11: - w[ 2] = w[ 2] | 0x80000000; + case 3: + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; - case 12: - w[ 3] = 0x80; + case 4: + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; break; - case 13: - w[ 3] = w[ 3] | 0x8000; + case 5: + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; - case 14: - w[ 3] = w[ 3] | 0x800000; + case 6: + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; - case 15: - w[ 3] = w[ 3] | 0x80000000; + case 7: + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; - case 16: - w[ 4] = 0x80; + case 8: + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; break; - case 17: - w[ 4] = w[ 4] | 0x8000; + case 9: + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; - case 18: - w[ 4] = w[ 4] | 0x800000; + case 10: + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; - case 19: - w[ 4] = w[ 4] | 0x80000000; + case 11: + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; - case 20: - w[ 5] = 0x80; + case 12: + dst0[3] = src_r0[0]; break; - case 21: - w[ 5] = w[ 5] | 0x8000; + case 13: + dst0[3] = src_l0[3] | src_r0[0] << 8; break; - case 22: - w[ 5] = w[ 5] | 0x800000; + case 14: + dst0[3] = src_l0[3] | src_r0[0] << 16; break; - case 23: - w[ 5] = w[ 5] | 0x80000000; + case 15: + dst0[3] = src_l0[3] | src_r0[0] << 24; break; + } +} - case 24: - w[ 6] = 0x80; +// before: device_memcat8L +static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4]) +{ + switch (offset) + { + case 1: + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[0] = src_r0[3] >> 24; break; - case 25: - w[ 6] = w[ 6] | 0x8000; + case 2: + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[0] = src_r0[3] >> 16; break; - case 26: - w[ 6] = w[ 6] | 0x800000; + case 3: + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[0] = src_r0[3] >> 8; break; - case 27: - w[ 6] = w[ 6] | 0x80000000; + case 4: + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; + dst1[0] = src_r0[3]; break; - case 28: - w[ 7] = 0x80; + case 5: + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[1] = src_r0[3] >> 24; break; - case 29: - w[ 7] = w[ 7] | 0x8000; + case 6: + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[1] = src_r0[3] >> 16; break; - case 30: - w[ 7] = w[ 7] | 0x800000; + case 7: + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[1] = src_r0[3] >> 8; break; - case 31: - w[ 7] = w[ 7] | 0x80000000; + case 8: + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; + dst1[0] = src_r0[2]; + dst1[1] = src_r0[3]; break; - case 32: - w[ 8] = 0x80; + case 9: + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[2] = src_r0[3] >> 24; break; - case 33: - w[ 8] = w[ 8] | 0x8000; + case 10: + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[2] = src_r0[3] >> 16; break; - case 34: - w[ 8] = w[ 8] | 0x800000; + case 11: + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[2] = src_r0[3] >> 8; break; - case 35: - w[ 8] = w[ 8] | 0x80000000; + case 12: + dst0[3] = src_r0[0]; + dst1[0] = src_r0[1]; + dst1[1] = src_r0[2]; + dst1[2] = src_r0[3]; break; - case 36: - w[ 9] = 0x80; + case 13: + dst0[3] = src_l0[3] | src_r0[0] << 8; + dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[3] = src_r0[3] >> 24; break; - case 37: - w[ 9] = w[ 9] | 0x8000; + case 14: + dst0[3] = src_l0[3] | src_r0[0] << 16; + dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[3] = src_r0[3] >> 16; break; - case 38: - w[ 9] = w[ 9] | 0x800000; + case 15: + dst0[3] = src_l0[3] | src_r0[0] << 24; + dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[3] = src_r0[3] >> 8; break; - case 39: - w[ 9] = w[ 9] | 0x80000000; + case 16: + dst1[0] = src_r0[0]; + dst1[1] = src_r0[1]; + dst1[2] = src_r0[2]; + dst1[3] = src_r0[3]; break; - case 40: - w[10] = 0x80; + case 17: + dst1[0] = src_l1[0] | src_r0[0] << 8; + dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; - case 41: - w[10] = w[10] | 0x8000; + case 18: + dst1[0] = src_l1[0] | src_r0[0] << 16; + dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; - case 42: - w[10] = w[10] | 0x800000; + case 19: + dst1[0] = src_l1[0] | src_r0[0] << 24; + dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; - case 43: - w[10] = w[10] | 0x80000000; + case 20: + dst1[1] = src_r0[0]; + dst1[2] = src_r0[1]; + dst1[3] = src_r0[2]; break; - case 44: - w[11] = 0x80; + case 21: + dst1[1] = src_l1[1] | src_r0[0] << 8; + dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; - case 45: - w[11] = w[11] | 0x8000; + case 22: + dst1[1] = src_l1[1] | src_r0[0] << 16; + dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; - case 46: - w[11] = w[11] | 0x800000; + case 23: + dst1[1] = src_l1[1] | src_r0[0] << 24; + dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; - case 47: - w[11] = w[11] | 0x80000000; + case 24: + dst1[2] = src_r0[0]; + dst1[3] = src_r0[1]; break; - case 48: - w[12] = 0x80; + case 25: + dst1[2] = src_l1[2] | src_r0[0] << 8; + dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; - case 49: - w[12] = w[12] | 0x8000; + case 26: + dst1[2] = src_l1[2] | src_r0[0] << 16; + dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; - case 50: - w[12] = w[12] | 0x800000; + case 27: + dst1[2] = src_l1[2] | src_r0[0] << 24; + dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; - case 51: - w[12] = w[12] | 0x80000000; + case 28: + dst1[3] = src_r0[0]; break; - case 52: - w[13] = 0x80; + case 29: + dst1[3] = src_l1[3] | src_r0[0] << 8; break; - case 53: - w[13] = w[13] | 0x8000; + case 30: + dst1[3] = src_l1[3] | src_r0[0] << 16; break; - case 54: - w[13] = w[13] | 0x800000; + case 31: + dst1[3] = src_l1[3] | src_r0[0] << 24; break; + } +} - case 55: - w[13] = w[13] | 0x80000000; +// before: device_memcat12L +static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4]) +{ + switch (offset) + { + case 1: + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[0] = src_r0[3] >> 24; break; - case 56: - w[14] = 0x80; + case 2: + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[0] = src_r0[3] >> 16; break; - case 57: - w[14] = w[14] | 0x8000; + case 3: + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[0] = src_r0[3] >> 8; break; - case 58: - w[14] = w[14] | 0x800000; + case 4: + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; + dst1[0] = src_r0[3]; break; - case 59: - w[14] = w[14] | 0x80000000; - break; - - case 60: - w[15] = 0x80; - break; - - case 61: - w[15] = w[15] | 0x8000; - break; - - case 62: - w[15] = w[15] | 0x800000; - break; - - case 63: - w[15] = w[15] | 0x80000000; - break; - } -} - -// before: append_0x80_8 -static void append_0x80_1x32 (u32 w[32], const u32 offset) -{ - switch (offset) - { - case 0: - w[ 0] = 0x80; - break; - - case 1: - w[ 0] = w[ 0] | 0x8000; - break; - - case 2: - w[ 0] = w[ 0] | 0x800000; - break; - - case 3: - w[ 0] = w[ 0] | 0x80000000; - break; - - case 4: - w[ 1] = 0x80; - break; - - case 5: - w[ 1] = w[ 1] | 0x8000; + case 5: + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[1] = src_r0[3] >> 24; break; case 6: - w[ 1] = w[ 1] | 0x800000; + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[1] = src_r0[3] >> 16; break; case 7: - w[ 1] = w[ 1] | 0x80000000; + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[1] = src_r0[3] >> 8; break; case 8: - w[ 2] = 0x80; + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; + dst1[0] = src_r0[2]; + dst1[1] = src_r0[3]; break; case 9: - w[ 2] = w[ 2] | 0x8000; + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[2] = src_r0[3] >> 24; break; case 10: - w[ 2] = w[ 2] | 0x800000; + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[2] = src_r0[3] >> 16; break; case 11: - w[ 2] = w[ 2] | 0x80000000; + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[2] = src_r0[3] >> 8; break; case 12: - w[ 3] = 0x80; + dst0[3] = src_r0[0]; + dst1[0] = src_r0[1]; + dst1[1] = src_r0[2]; + dst1[2] = src_r0[3]; break; case 13: - w[ 3] = w[ 3] | 0x8000; + dst0[3] = src_l0[3] | src_r0[0] << 8; + dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[3] = src_r0[3] >> 24; break; case 14: - w[ 3] = w[ 3] | 0x800000; + dst0[3] = src_l0[3] | src_r0[0] << 16; + dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[3] = src_r0[3] >> 16; break; case 15: - w[ 3] = w[ 3] | 0x80000000; + dst0[3] = src_l0[3] | src_r0[0] << 24; + dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[3] = src_r0[3] >> 8; break; case 16: - w[ 4] = 0x80; + dst1[0] = src_r0[0]; + dst1[1] = src_r0[1]; + dst1[2] = src_r0[2]; + dst1[3] = src_r0[3]; break; case 17: - w[ 4] = w[ 4] | 0x8000; + dst1[0] = src_l1[0] | src_r0[0] << 8; + dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[0] = src_r0[3] >> 24; break; case 18: - w[ 4] = w[ 4] | 0x800000; + dst1[0] = src_l1[0] | src_r0[0] << 16; + dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[0] = src_r0[3] >> 16; break; case 19: - w[ 4] = w[ 4] | 0x80000000; + dst1[0] = src_l1[0] | src_r0[0] << 24; + dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[0] = src_r0[3] >> 8; break; case 20: - w[ 5] = 0x80; + dst1[1] = src_r0[0]; + dst1[2] = src_r0[1]; + dst1[3] = src_r0[2]; + dst2[0] = src_r0[3]; break; case 21: - w[ 5] = w[ 5] | 0x8000; + dst1[1] = src_l1[1] | src_r0[0] << 8; + dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[1] = src_r0[3] >> 24; break; case 22: - w[ 5] = w[ 5] | 0x800000; + dst1[1] = src_l1[1] | src_r0[0] << 16; + dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[1] = src_r0[3] >> 16; break; case 23: - w[ 5] = w[ 5] | 0x80000000; + dst1[1] = src_l1[1] | src_r0[0] << 24; + dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[1] = src_r0[3] >> 8; break; case 24: - w[ 6] = 0x80; + dst1[2] = src_r0[0]; + dst1[3] = src_r0[1]; + dst2[0] = src_r0[2]; + dst2[1] = src_r0[3]; break; case 25: - w[ 6] = w[ 6] | 0x8000; + dst1[2] = src_l1[2] | src_r0[0] << 8; + dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[2] = src_r0[3] >> 24; break; case 26: - w[ 6] = w[ 6] | 0x800000; + dst1[2] = src_l1[2] | src_r0[0] << 16; + dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[2] = src_r0[3] >> 16; break; case 27: - w[ 6] = w[ 6] | 0x80000000; + dst1[2] = src_l1[2] | src_r0[0] << 24; + dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[2] = src_r0[3] >> 8; break; case 28: - w[ 7] = 0x80; + dst1[3] = src_r0[0]; + dst2[0] = src_r0[1]; + dst2[1] = src_r0[2]; + dst2[2] = src_r0[3]; break; case 29: - w[ 7] = w[ 7] | 0x8000; + dst1[3] = src_l1[3] | src_r0[0] << 8; + dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[3] = src_r0[3] >> 24; break; case 30: - w[ 7] = w[ 7] | 0x800000; + dst1[3] = src_l1[3] | src_r0[0] << 16; + dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[3] = src_r0[3] >> 16; break; case 31: - w[ 7] = w[ 7] | 0x80000000; + dst1[3] = src_l1[3] | src_r0[0] << 24; + dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[3] = src_r0[3] >> 8; break; case 32: - w[ 8] = 0x80; + dst2[0] = src_r0[0]; + dst2[1] = src_r0[1]; + dst2[2] = src_r0[2]; + dst2[3] = src_r0[3]; break; case 33: - w[ 8] = w[ 8] | 0x8000; + dst2[0] = src_l2[0] | src_r0[0] << 8; + dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; case 34: - w[ 8] = w[ 8] | 0x800000; + dst2[0] = src_l2[0] | src_r0[0] << 16; + dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; case 35: - w[ 8] = w[ 8] | 0x80000000; + dst2[0] = src_l2[0] | src_r0[0] << 24; + dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; case 36: - w[ 9] = 0x80; + dst2[1] = src_r0[0]; + dst2[2] = src_r0[1]; + dst2[3] = src_r0[2]; break; case 37: - w[ 9] = w[ 9] | 0x8000; + dst2[1] = src_l2[1] | src_r0[0] << 8; + dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; case 38: - w[ 9] = w[ 9] | 0x800000; + dst2[1] = src_l2[1] | src_r0[0] << 16; + dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; case 39: - w[ 9] = w[ 9] | 0x80000000; + dst2[1] = src_l2[1] | src_r0[0] << 24; + dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; case 40: - w[10] = 0x80; + dst2[2] = src_r0[0]; + dst2[3] = src_r0[1]; break; case 41: - w[10] = w[10] | 0x8000; + dst2[2] = src_l2[2] | src_r0[0] << 8; + dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; case 42: - w[10] = w[10] | 0x800000; + dst2[2] = src_l2[2] | src_r0[0] << 16; + dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; case 43: - w[10] = w[10] | 0x80000000; + dst2[2] = src_l2[2] | src_r0[0] << 24; + dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; case 44: - w[11] = 0x80; + dst2[3] = src_r0[0]; break; case 45: - w[11] = w[11] | 0x8000; + dst2[3] = src_l2[3] | src_r0[0] << 8; break; case 46: - w[11] = w[11] | 0x800000; + dst2[3] = src_l2[3] | src_r0[0] << 16; break; case 47: - w[11] = w[11] | 0x80000000; + dst2[3] = src_l2[3] | src_r0[0] << 24; break; + } +} - case 48: - w[12] = 0x80; - break; +// before: device_memcat12L +static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4]) +{ + switch (offset) + { + case 0: + dst0[0] = src_r0[0]; + dst0[1] = src_r0[1]; + dst0[2] = src_r0[2]; + dst0[3] = src_r0[3]; + dst1[0] = src_r1[0]; + dst1[1] = src_r1[1]; + dst1[2] = src_r1[2]; + dst1[3] = src_r1[3]; + break; - case 49: - w[12] = w[12] | 0x8000; + case 1: + dst0[0] = src_l0[0] | src_r0[0] << 8; + dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8; + dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8; + dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8; + dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[0] = src_r1[3] >> 24; break; - case 50: - w[12] = w[12] | 0x800000; + case 2: + dst0[0] = src_l0[0] | src_r0[0] << 16; + dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16; + dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16; + dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16; + dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[0] = src_r1[3] >> 16; break; - case 51: - w[12] = w[12] | 0x80000000; + case 3: + dst0[0] = src_l0[0] | src_r0[0] << 24; + dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24; + dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24; + dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24; + dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[0] = src_r1[3] >> 8; break; - case 52: - w[13] = 0x80; + case 4: + dst0[1] = src_r0[0]; + dst0[2] = src_r0[1]; + dst0[3] = src_r0[2]; + dst1[0] = src_r0[3]; + dst1[1] = src_r1[0]; + dst1[2] = src_r1[1]; + dst1[3] = src_r1[2]; + dst2[0] = src_r1[3]; break; - case 53: - w[13] = w[13] | 0x8000; + case 5: + dst0[1] = src_l0[1] | src_r0[0] << 8; + dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8; + dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8; + dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[1] = src_r1[3] >> 24; break; - case 54: - w[13] = w[13] | 0x800000; + case 6: + dst0[1] = src_l0[1] | src_r0[0] << 16; + dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16; + dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16; + dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[1] = src_r1[3] >> 16; break; - case 55: - w[13] = w[13] | 0x80000000; + case 7: + dst0[1] = src_l0[1] | src_r0[0] << 24; + dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24; + dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24; + dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[1] = src_r1[3] >> 8; break; - case 56: - w[14] = 0x80; + case 8: + dst0[2] = src_r0[0]; + dst0[3] = src_r0[1]; + dst1[0] = src_r0[2]; + dst1[1] = src_r0[3]; + dst1[2] = src_r1[0]; + dst1[3] = src_r1[1]; + dst2[0] = src_r1[2]; + dst2[1] = src_r1[3]; break; - case 57: - w[14] = w[14] | 0x8000; + case 9: + dst0[2] = src_l0[2] | src_r0[0] << 8; + dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8; + dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[2] = src_r1[3] >> 24; break; - case 58: - w[14] = w[14] | 0x800000; + case 10: + dst0[2] = src_l0[2] | src_r0[0] << 16; + dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16; + dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[2] = src_r1[3] >> 16; break; - case 59: - w[14] = w[14] | 0x80000000; + case 11: + dst0[2] = src_l0[2] | src_r0[0] << 24; + dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24; + dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[2] = src_r1[3] >> 8; break; - case 60: - w[15] = 0x80; + case 12: + dst0[3] = src_r0[0]; + dst1[0] = src_r0[1]; + dst1[1] = src_r0[2]; + dst1[2] = src_r0[3]; + dst1[3] = src_r1[0]; + dst2[0] = src_r1[1]; + dst2[1] = src_r1[2]; + dst2[2] = src_r1[3]; break; - case 61: - w[15] = w[15] | 0x8000; + case 13: + dst0[3] = src_l0[3] | src_r0[0] << 8; + dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8; + dst2[3] = src_r1[3] >> 24; break; - case 62: - w[15] = w[15] | 0x800000; + case 14: + dst0[3] = src_l0[3] | src_r0[0] << 16; + dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16; + dst2[3] = src_r1[3] >> 16; break; - case 63: - w[15] = w[15] | 0x80000000; + case 15: + dst0[3] = src_l0[3] | src_r0[0] << 24; + dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24; + dst2[3] = src_r1[3] >> 8; break; - case 64: - w[16] = 0x80; + case 16: + dst1[0] = src_r0[0]; + dst1[1] = src_r0[1]; + dst1[2] = src_r0[2]; + dst1[3] = src_r0[3]; + dst2[0] = src_r1[0]; + dst2[1] = src_r1[1]; + dst2[2] = src_r1[2]; + dst2[3] = src_r1[3]; break; - case 65: - w[16] = w[16] | 0x8000; + case 17: + dst1[0] = src_l1[0] | src_r0[0] << 8; + dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8; + dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8; break; - case 66: - w[16] = w[16] | 0x800000; + case 18: + dst1[0] = src_l1[0] | src_r0[0] << 16; + dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16; + dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16; break; - case 67: - w[16] = w[16] | 0x80000000; + case 19: + dst1[0] = src_l1[0] | src_r0[0] << 24; + dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24; + dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24; break; - case 68: - w[17] = 0x80; + case 20: + dst1[1] = src_r1[0]; + dst1[2] = src_r0[1]; + dst1[3] = src_r0[2]; + dst2[0] = src_r0[3]; + dst2[1] = src_r1[0]; + dst2[2] = src_r1[1]; + dst2[3] = src_r1[2]; break; - case 69: - w[17] = w[17] | 0x8000; + case 21: + dst1[1] = src_l1[1] | src_r0[0] << 8; + dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8; + dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8; break; - case 70: - w[17] = w[17] | 0x800000; - break; - - case 71: - w[17] = w[17] | 0x80000000; - break; - - case 72: - w[18] = 0x80; + case 22: + dst1[1] = src_l1[1] | src_r0[0] << 16; + dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16; + dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16; break; - case 73: - w[18] = w[18] | 0x8000; + case 23: + dst1[1] = src_l1[1] | src_r0[0] << 24; + dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24; + dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24; break; - case 74: - w[18] = w[18] | 0x800000; + case 24: + dst1[2] = src_r1[0]; + dst1[3] = src_r0[1]; + dst2[0] = src_r0[2]; + dst2[1] = src_r0[3]; + dst2[2] = src_r1[0]; + dst2[3] = src_r1[1]; break; - case 75: - w[18] = w[18] | 0x80000000; + case 25: + dst1[2] = src_l1[2] | src_r0[0] << 8; + dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8; + dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8; break; - case 76: - w[19] = 0x80; + case 26: + dst1[2] = src_l1[2] | src_r0[0] << 16; + dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16; + dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16; break; - case 77: - w[19] = w[19] | 0x8000; + case 27: + dst1[2] = src_l1[2] | src_r0[0] << 24; + dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24; + dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24; break; - case 78: - w[19] = w[19] | 0x800000; + case 28: + dst1[3] = src_r1[0]; + dst2[0] = src_r0[1]; + dst2[1] = src_r0[2]; + dst2[2] = src_r0[3]; + dst2[3] = src_r1[0]; break; - case 79: - w[19] = w[19] | 0x80000000; + case 29: + dst1[3] = src_l1[3] | src_r0[0] << 8; + dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; + dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8; break; - case 80: - w[20] = 0x80; + case 30: + dst1[3] = src_l1[3] | src_r0[0] << 16; + dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; + dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16; break; - case 81: - w[20] = w[20] | 0x8000; + case 31: + dst1[3] = src_l1[3] | src_r0[0] << 24; + dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; + dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24; break; - case 82: - w[20] = w[20] | 0x800000; + case 32: + dst2[0] = src_r0[0]; + dst2[1] = src_r0[1]; + dst2[2] = src_r0[2]; + dst2[3] = src_r0[3]; break; - case 83: - w[20] = w[20] | 0x80000000; + case 33: + dst2[0] = src_l2[0] | src_r0[0] << 8; + dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; + dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; break; - case 84: - w[21] = 0x80; + case 34: + dst2[0] = src_l2[0] | src_r0[0] << 16; + dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; + dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; break; - case 85: - w[21] = w[21] | 0x8000; + case 35: + dst2[0] = src_l2[0] | src_r0[0] << 24; + dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; + dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; break; - case 86: - w[21] = w[21] | 0x800000; + case 36: + dst2[1] = src_r0[0]; + dst2[2] = src_r0[1]; + dst2[3] = src_r0[2]; break; - case 87: - w[21] = w[21] | 0x80000000; + case 37: + dst2[1] = src_l2[1] | src_r0[0] << 8; + dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; + dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; break; - case 88: - w[22] = 0x80; + case 38: + dst2[1] = src_l2[1] | src_r0[0] << 16; + dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; + dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; break; - case 89: - w[22] = w[22] | 0x8000; + case 39: + dst2[1] = src_l2[1] | src_r0[0] << 24; + dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; + dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; - case 90: - w[22] = w[22] | 0x800000; + case 40: + dst2[2] = src_r0[0]; + dst2[3] = src_r0[1]; break; - case 91: - w[22] = w[22] | 0x80000000; + case 41: + dst2[2] = src_l2[2] | src_r0[0] << 8; + dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; break; - case 92: - w[23] = 0x80; + case 42: + dst2[2] = src_l2[2] | src_r0[0] << 16; + dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; break; - case 93: - w[23] = w[23] | 0x8000; + case 43: + dst2[2] = src_l2[2] | src_r0[0] << 24; + dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; break; - case 94: - w[23] = w[23] | 0x800000; + case 44: + dst2[3] = src_r0[0]; break; - case 95: - w[23] = w[23] | 0x80000000; + case 45: + dst2[3] = src_l2[3] | src_r0[0] << 8; break; - case 96: - w[24] = 0x80; + case 46: + dst2[3] = src_l2[3] | src_r0[0] << 16; break; - case 97: - w[24] = w[24] | 0x8000; + case 47: + dst2[3] = src_l2[3] | src_r0[0] << 24; break; + } +} - case 98: - w[24] = w[24] | 0x800000; +// before: memcat16_9 +static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = append0[0]; + w0[1] = append0[1]; + w0[2] = append0[2]; + w0[3] = append0[3]; + w1[0] = append1[0]; + w1[1] = append1[1]; + w1[2] = append1[2]; + w1[3] = append1[3]; + w2[0] = append2[0]; break; - case 99: - w[24] = w[24] | 0x80000000; + case 1: + w0[0] = w0[0] | append0[0] << 8; + w0[1] = append0[0] >> 24 | append0[1] << 8; + w0[2] = append0[1] >> 24 | append0[2] << 8; + w0[3] = append0[2] >> 24 | append0[3] << 8; + w1[0] = append0[3] >> 24 | append1[0] << 8; + w1[1] = append1[0] >> 24 | append1[1] << 8; + w1[2] = append1[1] >> 24 | append1[2] << 8; + w1[3] = append1[2] >> 24 | append1[3] << 8; + w2[0] = append1[3] >> 24 | append2[0] << 8; + w2[1] = append2[0] >> 24; break; - case 100: - w[25] = 0x80; + case 2: + w0[0] = w0[0] | append0[0] << 16; + w0[1] = append0[0] >> 16 | append0[1] << 16; + w0[2] = append0[1] >> 16 | append0[2] << 16; + w0[3] = append0[2] >> 16 | append0[3] << 16; + w1[0] = append0[3] >> 16 | append1[0] << 16; + w1[1] = append1[0] >> 16 | append1[1] << 16; + w1[2] = append1[1] >> 16 | append1[2] << 16; + w1[3] = append1[2] >> 16 | append1[3] << 16; + w2[0] = append1[3] >> 16 | append2[0] << 16; + w2[1] = append2[0] >> 16; break; - case 101: - w[25] = w[25] | 0x8000; + case 3: + w0[0] = w0[0] | append0[0] << 24; + w0[1] = append0[0] >> 8 | append0[1] << 24; + w0[2] = append0[1] >> 8 | append0[2] << 24; + w0[3] = append0[2] >> 8 | append0[3] << 24; + w1[0] = append0[3] >> 8 | append1[0] << 24; + w1[1] = append1[0] >> 8 | append1[1] << 24; + w1[2] = append1[1] >> 8 | append1[2] << 24; + w1[3] = append1[2] >> 8 | append1[3] << 24; + w2[0] = append1[3] >> 8 | append2[0] << 24; + w2[1] = append2[0] >> 8; break; - case 102: - w[25] = w[25] | 0x800000; + case 4: + w0[1] = append0[0]; + w0[2] = append0[1]; + w0[3] = append0[2]; + w1[0] = append0[3]; + w1[1] = append1[0]; + w1[2] = append1[1]; + w1[3] = append1[2]; + w2[0] = append1[3]; + w2[1] = append2[0]; break; - case 103: - w[25] = w[25] | 0x80000000; - break; - - case 104: - w[26] = 0x80; - break; - - case 105: - w[26] = w[26] | 0x8000; - break; - - case 106: - w[26] = w[26] | 0x800000; - break; - - case 107: - w[26] = w[26] | 0x80000000; - break; - - case 108: - w[27] = 0x80; - break; - - case 109: - w[27] = w[27] | 0x8000; - break; - - case 110: - w[27] = w[27] | 0x800000; - break; - - case 111: - w[27] = w[27] | 0x80000000; - break; - - case 112: - w[28] = 0x80; - break; - - case 113: - w[28] = w[28] | 0x8000; - break; - - case 114: - w[28] = w[28] | 0x800000; - break; - - case 115: - w[28] = w[28] | 0x80000000; - break; - - case 116: - w[29] = 0x80; - break; - - case 117: - w[29] = w[29] | 0x8000; + case 5: + w0[1] = w0[1] | append0[0] << 8; + w0[2] = append0[0] >> 24 | append0[1] << 8; + w0[3] = append0[1] >> 24 | append0[2] << 8; + w1[0] = append0[2] >> 24 | append0[3] << 8; + w1[1] = append0[3] >> 24 | append1[0] << 8; + w1[2] = append1[0] >> 24 | append1[1] << 8; + w1[3] = append1[1] >> 24 | append1[2] << 8; + w2[0] = append1[2] >> 24 | append1[3] << 8; + w2[1] = append1[3] >> 24 | append2[0] << 8; + w2[2] = append2[0] >> 24; break; - case 118: - w[29] = w[29] | 0x800000; + case 6: + w0[1] = w0[1] | append0[0] << 16; + w0[2] = append0[0] >> 16 | append0[1] << 16; + w0[3] = append0[1] >> 16 | append0[2] << 16; + w1[0] = append0[2] >> 16 | append0[3] << 16; + w1[1] = append0[3] >> 16 | append1[0] << 16; + w1[2] = append1[0] >> 16 | append1[1] << 16; + w1[3] = append1[1] >> 16 | append1[2] << 16; + w2[0] = append1[2] >> 16 | append1[3] << 16; + w2[1] = append1[3] >> 16 | append2[0] << 16; + w2[2] = append2[0] >> 16; break; - case 119: - w[29] = w[29] | 0x80000000; + case 7: + w0[1] = w0[1] | append0[0] << 24; + w0[2] = append0[0] >> 8 | append0[1] << 24; + w0[3] = append0[1] >> 8 | append0[2] << 24; + w1[0] = append0[2] >> 8 | append0[3] << 24; + w1[1] = append0[3] >> 8 | append1[0] << 24; + w1[2] = append1[0] >> 8 | append1[1] << 24; + w1[3] = append1[1] >> 8 | append1[2] << 24; + w2[0] = append1[2] >> 8 | append1[3] << 24; + w2[1] = append1[3] >> 8 | append2[0] << 24; + w2[2] = append2[0] >> 8; break; - case 120: - w[30] = 0x80; + case 8: + w0[2] = append0[0]; + w0[3] = append0[1]; + w1[0] = append0[2]; + w1[1] = append0[3]; + w1[2] = append1[0]; + w1[3] = append1[1]; + w2[0] = append1[2]; + w2[1] = append1[3]; + w2[2] = append2[0]; break; - case 121: - w[30] = w[30] | 0x8000; + case 9: + w0[2] = w0[2] | append0[0] << 8; + w0[3] = append0[0] >> 24 | append0[1] << 8; + w1[0] = append0[1] >> 24 | append0[2] << 8; + w1[1] = append0[2] >> 24 | append0[3] << 8; + w1[2] = append0[3] >> 24 | append1[0] << 8; + w1[3] = append1[0] >> 24 | append1[1] << 8; + w2[0] = append1[1] >> 24 | append1[2] << 8; + w2[1] = append1[2] >> 24 | append1[3] << 8; + w2[2] = append1[3] >> 24 | append2[0] << 8; + w2[3] = append2[0] >> 24; break; - case 122: - w[30] = w[30] | 0x800000; + case 10: + w0[2] = w0[2] | append0[0] << 16; + w0[3] = append0[0] >> 16 | append0[1] << 16; + w1[0] = append0[1] >> 16 | append0[2] << 16; + w1[1] = append0[2] >> 16 | append0[3] << 16; + w1[2] = append0[3] >> 16 | append1[0] << 16; + w1[3] = append1[0] >> 16 | append1[1] << 16; + w2[0] = append1[1] >> 16 | append1[2] << 16; + w2[1] = append1[2] >> 16 | append1[3] << 16; + w2[2] = append1[3] >> 16 | append2[0] << 16; + w2[3] = append2[0] >> 16; break; - case 123: - w[30] = w[30] | 0x80000000; + case 11: + w0[2] = w0[2] | append0[0] << 24; + w0[3] = append0[0] >> 8 | append0[1] << 24; + w1[0] = append0[1] >> 8 | append0[2] << 24; + w1[1] = append0[2] >> 8 | append0[3] << 24; + w1[2] = append0[3] >> 8 | append1[0] << 24; + w1[3] = append1[0] >> 8 | append1[1] << 24; + w2[0] = append1[1] >> 8 | append1[2] << 24; + w2[1] = append1[2] >> 8 | append1[3] << 24; + w2[2] = append1[3] >> 8 | append2[0] << 24; + w2[3] = append2[0] >> 8; break; - case 124: - w[31] = 0x80; + case 12: + w0[3] = append0[0]; + w1[0] = append0[1]; + w1[1] = append0[2]; + w1[2] = append0[3]; + w1[3] = append1[0]; + w2[0] = append1[1]; + w2[1] = append1[2]; + w2[2] = append1[3]; + w2[3] = append2[0]; break; - case 125: - w[31] = w[31] | 0x8000; + case 13: + w0[3] = w0[3] | append0[0] << 8; + w1[0] = append0[0] >> 24 | append0[1] << 8; + w1[1] = append0[1] >> 24 | append0[2] << 8; + w1[2] = append0[2] >> 24 | append0[3] << 8; + w1[3] = append0[3] >> 24 | append1[0] << 8; + w2[0] = append1[0] >> 24 | append1[1] << 8; + w2[1] = append1[1] >> 24 | append1[2] << 8; + w2[2] = append1[2] >> 24 | append1[3] << 8; + w2[3] = append1[3] >> 24 | append2[0] << 8; + w3[0] = append2[0] >> 24; break; - case 126: - w[31] = w[31] | 0x800000; + case 14: + w0[3] = w0[3] | append0[0] << 16; + w1[0] = append0[0] >> 16 | append0[1] << 16; + w1[1] = append0[1] >> 16 | append0[2] << 16; + w1[2] = append0[2] >> 16 | append0[3] << 16; + w1[3] = append0[3] >> 16 | append1[0] << 16; + w2[0] = append1[0] >> 16 | append1[1] << 16; + w2[1] = append1[1] >> 16 | append1[2] << 16; + w2[2] = append1[2] >> 16 | append1[3] << 16; + w2[3] = append1[3] >> 16 | append2[0] << 16; + w3[0] = append2[0] >> 16; break; - case 127: - w[31] = w[31] | 0x80000000; + case 15: + w0[3] = w0[3] | append0[0] << 24; + w1[0] = append0[0] >> 8 | append0[1] << 24; + w1[1] = append0[1] >> 8 | append0[2] << 24; + w1[2] = append0[2] >> 8 | append0[3] << 24; + w1[3] = append0[3] >> 8 | append1[0] << 24; + w2[0] = append1[0] >> 8 | append1[1] << 24; + w2[1] = append1[1] >> 8 | append1[2] << 24; + w2[2] = append1[2] >> 8 | append1[3] << 24; + w2[3] = append1[3] >> 8 | append2[0] << 24; + w3[0] = append2[0] >> 8; break; } } -// before: device_memcat2L -static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2]) +// before: memcat32_8 +static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset) { switch (offset) { + case 0: + w0[0] = append0[0]; + w0[1] = append0[1]; + w0[2] = append0[2]; + w0[3] = append0[3]; + w1[0] = append1[0]; + w1[1] = append1[1]; + w1[2] = append1[2]; + w1[3] = append1[3]; + break; + case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; + w0[0] = w0[0] | append0[0] << 8; + w0[1] = append0[0] >> 24 | append0[1] << 8; + w0[2] = append0[1] >> 24 | append0[2] << 8; + w0[3] = append0[2] >> 24 | append0[3] << 8; + w1[0] = append0[3] >> 24 | append1[0] << 8; + w1[1] = append1[0] >> 24 | append1[1] << 8; + w1[2] = append1[1] >> 24 | append1[2] << 8; + w1[3] = append1[2] >> 24 | append1[3] << 8; + w2[0] = append1[3] >> 24; break; case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; + w0[0] = w0[0] | append0[0] << 16; + w0[1] = append0[0] >> 16 | append0[1] << 16; + w0[2] = append0[1] >> 16 | append0[2] << 16; + w0[3] = append0[2] >> 16 | append0[3] << 16; + w1[0] = append0[3] >> 16 | append1[0] << 16; + w1[1] = append1[0] >> 16 | append1[1] << 16; + w1[2] = append1[1] >> 16 | append1[2] << 16; + w1[3] = append1[2] >> 16 | append1[3] << 16; + w2[0] = append1[3] >> 16; break; case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; + w0[0] = w0[0] | append0[0] << 24; + w0[1] = append0[0] >> 8 | append0[1] << 24; + w0[2] = append0[1] >> 8 | append0[2] << 24; + w0[3] = append0[2] >> 8 | append0[3] << 24; + w1[0] = append0[3] >> 8 | append1[0] << 24; + w1[1] = append1[0] >> 8 | append1[1] << 24; + w1[2] = append1[1] >> 8 | append1[2] << 24; + w1[3] = append1[2] >> 8 | append1[3] << 24; + w2[0] = append1[3] >> 8; break; case 4: - dst0[1] = src_r0[0]; + w0[1] = append0[0]; + w0[2] = append0[1]; + w0[3] = append0[2]; + w1[0] = append0[3]; + w1[1] = append1[0]; + w1[2] = append1[1]; + w1[3] = append1[2]; + w2[0] = append1[3]; break; case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - break; - - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - break; - } -} - -// before: device_memcat4L -static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4]) -{ - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; - - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; - - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; - - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - break; - - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; + w0[1] = w0[1] | append0[0] << 8; + w0[2] = append0[0] >> 24 | append0[1] << 8; + w0[3] = append0[1] >> 24 | append0[2] << 8; + w1[0] = append0[2] >> 24 | append0[3] << 8; + w1[1] = append0[3] >> 24 | append1[0] << 8; + w1[2] = append1[0] >> 24 | append1[1] << 8; + w1[3] = append1[1] >> 24 | append1[2] << 8; + w2[0] = append1[2] >> 24 | append1[3] << 8; + w2[1] = append1[3] >> 24; break; case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; + w0[1] = w0[1] | append0[0] << 16; + w0[2] = append0[0] >> 16 | append0[1] << 16; + w0[3] = append0[1] >> 16 | append0[2] << 16; + w1[0] = append0[2] >> 16 | append0[3] << 16; + w1[1] = append0[3] >> 16 | append1[0] << 16; + w1[2] = append1[0] >> 16 | append1[1] << 16; + w1[3] = append1[1] >> 16 | append1[2] << 16; + w2[0] = append1[2] >> 16 | append1[3] << 16; + w2[1] = append1[3] >> 16; break; case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; + w0[1] = w0[1] | append0[0] << 24; + w0[2] = append0[0] >> 8 | append0[1] << 24; + w0[3] = append0[1] >> 8 | append0[2] << 24; + w1[0] = append0[2] >> 8 | append0[3] << 24; + w1[1] = append0[3] >> 8 | append1[0] << 24; + w1[2] = append1[0] >> 8 | append1[1] << 24; + w1[3] = append1[1] >> 8 | append1[2] << 24; + w2[0] = append1[2] >> 8 | append1[3] << 24; + w2[1] = append1[3] >> 8; break; case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; + w0[2] = append0[0]; + w0[3] = append0[1]; + w1[0] = append0[2]; + w1[1] = append0[3]; + w1[2] = append1[0]; + w1[3] = append1[1]; + w2[0] = append1[2]; + w2[1] = append1[3]; break; case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; + w0[2] = w0[2] | append0[0] << 8; + w0[3] = append0[0] >> 24 | append0[1] << 8; + w1[0] = append0[1] >> 24 | append0[2] << 8; + w1[1] = append0[2] >> 24 | append0[3] << 8; + w1[2] = append0[3] >> 24 | append1[0] << 8; + w1[3] = append1[0] >> 24 | append1[1] << 8; + w2[0] = append1[1] >> 24 | append1[2] << 8; + w2[1] = append1[2] >> 24 | append1[3] << 8; + w2[2] = append1[3] >> 24; break; case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; + w0[2] = w0[2] | append0[0] << 16; + w0[3] = append0[0] >> 16 | append0[1] << 16; + w1[0] = append0[1] >> 16 | append0[2] << 16; + w1[1] = append0[2] >> 16 | append0[3] << 16; + w1[2] = append0[3] >> 16 | append1[0] << 16; + w1[3] = append1[0] >> 16 | append1[1] << 16; + w2[0] = append1[1] >> 16 | append1[2] << 16; + w2[1] = append1[2] >> 16 | append1[3] << 16; + w2[2] = append1[3] >> 16; break; case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; + w0[2] = w0[2] | append0[0] << 24; + w0[3] = append0[0] >> 8 | append0[1] << 24; + w1[0] = append0[1] >> 8 | append0[2] << 24; + w1[1] = append0[2] >> 8 | append0[3] << 24; + w1[2] = append0[3] >> 8 | append1[0] << 24; + w1[3] = append1[0] >> 8 | append1[1] << 24; + w2[0] = append1[1] >> 8 | append1[2] << 24; + w2[1] = append1[2] >> 8 | append1[3] << 24; + w2[2] = append1[3] >> 8; break; case 12: - dst0[3] = src_r0[0]; + w0[3] = append0[0]; + w1[0] = append0[1]; + w1[1] = append0[2]; + w1[2] = append0[3]; + w1[3] = append1[0]; + w2[0] = append1[1]; + w2[1] = append1[2]; + w2[2] = append1[3]; break; case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; + w0[3] = w0[3] | append0[0] << 8; + w1[0] = append0[0] >> 24 | append0[1] << 8; + w1[1] = append0[1] >> 24 | append0[2] << 8; + w1[2] = append0[2] >> 24 | append0[3] << 8; + w1[3] = append0[3] >> 24 | append1[0] << 8; + w2[0] = append1[0] >> 24 | append1[1] << 8; + w2[1] = append1[1] >> 24 | append1[2] << 8; + w2[2] = append1[2] >> 24 | append1[3] << 8; + w2[3] = append1[3] >> 24; break; case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; + w0[3] = w0[3] | append0[0] << 16; + w1[0] = append0[0] >> 16 | append0[1] << 16; + w1[1] = append0[1] >> 16 | append0[2] << 16; + w1[2] = append0[2] >> 16 | append0[3] << 16; + w1[3] = append0[3] >> 16 | append1[0] << 16; + w2[0] = append1[0] >> 16 | append1[1] << 16; + w2[1] = append1[1] >> 16 | append1[2] << 16; + w2[2] = append1[2] >> 16 | append1[3] << 16; + w2[3] = append1[3] >> 16; break; case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - break; - } -} - -// before: device_memcat8L -static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4]) -{ - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24; + w0[3] = w0[3] | append0[0] << 24; + w1[0] = append0[0] >> 8 | append0[1] << 24; + w1[1] = append0[1] >> 8 | append0[2] << 24; + w1[2] = append0[2] >> 8 | append0[3] << 24; + w1[3] = append0[3] >> 8 | append1[0] << 24; + w2[0] = append1[0] >> 8 | append1[1] << 24; + w2[1] = append1[1] >> 8 | append1[2] << 24; + w2[2] = append1[2] >> 8 | append1[3] << 24; + w2[3] = append1[3] >> 8; break; - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16; + case 16: + w1[0] = append0[0]; + w1[1] = append0[1]; + w1[2] = append0[2]; + w1[3] = append0[3]; + w2[0] = append1[0]; + w2[1] = append1[1]; + w2[2] = append1[2]; + w2[3] = append1[3]; break; - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8; + case 17: + w1[0] = w1[0] | append0[0] << 8; + w1[1] = append0[0] >> 24 | append0[1] << 8; + w1[2] = append0[1] >> 24 | append0[2] << 8; + w1[3] = append0[2] >> 24 | append0[3] << 8; + w2[0] = append0[3] >> 24 | append1[0] << 8; + w2[1] = append1[0] >> 24 | append1[1] << 8; + w2[2] = append1[1] >> 24 | append1[2] << 8; + w2[3] = append1[2] >> 24 | append1[3] << 8; + w3[0] = append1[3] >> 24; break; - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; + case 18: + w1[0] = w1[0] | append0[0] << 16; + w1[1] = append0[0] >> 16 | append0[1] << 16; + w1[2] = append0[1] >> 16 | append0[2] << 16; + w1[3] = append0[2] >> 16 | append0[3] << 16; + w2[0] = append0[3] >> 16 | append1[0] << 16; + w2[1] = append1[0] >> 16 | append1[1] << 16; + w2[2] = append1[1] >> 16 | append1[2] << 16; + w2[3] = append1[2] >> 16 | append1[3] << 16; + w3[0] = append1[3] >> 16; break; - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24; + case 19: + w1[0] = w1[0] | append0[0] << 24; + w1[1] = append0[0] >> 8 | append0[1] << 24; + w1[2] = append0[1] >> 8 | append0[2] << 24; + w1[3] = append0[2] >> 8 | append0[3] << 24; + w2[0] = append0[3] >> 8 | append1[0] << 24; + w2[1] = append1[0] >> 8 | append1[1] << 24; + w2[2] = append1[1] >> 8 | append1[2] << 24; + w2[3] = append1[2] >> 8 | append1[3] << 24; + w3[0] = append1[3] >> 8; break; - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8; - break; - - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; - break; - - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24; - break; - - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16; - break; - - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8; - break; - - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; - break; - - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24; - break; - - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16; - break; - - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8; - break; - - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; - break; - - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; - - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; - - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; - - case 20: - dst1[1] = src_r0[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; + case 20: + w1[1] = append0[0]; + w1[2] = append0[1]; + w1[3] = append0[2]; + w2[0] = append0[3]; + w2[1] = append1[0]; + w2[2] = append1[1]; + w2[3] = append1[2]; + w3[0] = append1[3]; break; case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; + w1[1] = w1[1] | append0[0] << 8; + w1[2] = append0[0] >> 24 | append0[1] << 8; + w1[3] = append0[1] >> 24 | append0[2] << 8; + w2[0] = append0[2] >> 24 | append0[3] << 8; + w2[1] = append0[3] >> 24 | append1[0] << 8; + w2[2] = append1[0] >> 24 | append1[1] << 8; + w2[3] = append1[1] >> 24 | append1[2] << 8; + w3[0] = append1[2] >> 24 | append1[3] << 8; + w3[1] = append1[3] >> 24; break; case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; + w1[1] = w1[1] | append0[0] << 16; + w1[2] = append0[0] >> 16 | append0[1] << 16; + w1[3] = append0[1] >> 16 | append0[2] << 16; + w2[0] = append0[2] >> 16 | append0[3] << 16; + w2[1] = append0[3] >> 16 | append1[0] << 16; + w2[2] = append1[0] >> 16 | append1[1] << 16; + w2[3] = append1[1] >> 16 | append1[2] << 16; + w3[0] = append1[2] >> 16 | append1[3] << 16; + w3[1] = append1[3] >> 16; break; case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; + w1[1] = w1[1] | append0[0] << 24; + w1[2] = append0[0] >> 8 | append0[1] << 24; + w1[3] = append0[1] >> 8 | append0[2] << 24; + w2[0] = append0[2] >> 8 | append0[3] << 24; + w2[1] = append0[3] >> 8 | append1[0] << 24; + w2[2] = append1[0] >> 8 | append1[1] << 24; + w2[3] = append1[1] >> 8 | append1[2] << 24; + w3[0] = append1[2] >> 8 | append1[3] << 24; + w3[1] = append1[3] >> 8; break; case 24: - dst1[2] = src_r0[0]; - dst1[3] = src_r0[1]; + w1[2] = append0[0]; + w1[3] = append0[1]; + w2[0] = append0[2]; + w2[1] = append0[3]; + w2[2] = append1[0]; + w2[3] = append1[1]; + w3[0] = append1[2]; + w3[1] = append1[3]; break; case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; + w1[2] = w1[2] | append0[0] << 8; + w1[3] = append0[0] >> 24 | append0[1] << 8; + w2[0] = append0[1] >> 24 | append0[2] << 8; + w2[1] = append0[2] >> 24 | append0[3] << 8; + w2[2] = append0[3] >> 24 | append1[0] << 8; + w2[3] = append1[0] >> 24 | append1[1] << 8; + w3[0] = append1[1] >> 24 | append1[2] << 8; + w3[1] = append1[2] >> 24 | append1[3] << 8; break; case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; + w1[2] = w1[2] | append0[0] << 16; + w1[3] = append0[0] >> 16 | append0[1] << 16; + w2[0] = append0[1] >> 16 | append0[2] << 16; + w2[1] = append0[2] >> 16 | append0[3] << 16; + w2[2] = append0[3] >> 16 | append1[0] << 16; + w2[3] = append1[0] >> 16 | append1[1] << 16; + w3[0] = append1[1] >> 16 | append1[2] << 16; + w3[1] = append1[2] >> 16 | append1[3] << 16; break; case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; + w1[2] = w1[2] | append0[0] << 24; + w1[3] = append0[0] >> 8 | append0[1] << 24; + w2[0] = append0[1] >> 8 | append0[2] << 24; + w2[1] = append0[2] >> 8 | append0[3] << 24; + w2[2] = append0[3] >> 8 | append1[0] << 24; + w2[3] = append1[0] >> 8 | append1[1] << 24; + w3[0] = append1[1] >> 8 | append1[2] << 24; + w3[1] = append1[2] >> 8 | append1[3] << 24; break; case 28: - dst1[3] = src_r0[0]; + w1[3] = append0[0]; + w2[0] = append0[1]; + w2[1] = append0[2]; + w2[2] = append0[3]; + w2[3] = append1[0]; + w3[0] = append1[1]; + w3[1] = append1[2]; break; case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; + w1[3] = w1[3] | append0[0] << 8; + w2[0] = append0[0] >> 24 | append0[1] << 8; + w2[1] = append0[1] >> 24 | append0[2] << 8; + w2[2] = append0[2] >> 24 | append0[3] << 8; + w2[3] = append0[3] >> 24 | append1[0] << 8; + w3[0] = append1[0] >> 24 | append1[1] << 8; + w3[1] = append1[1] >> 24 | append1[2] << 8; break; case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; + w1[3] = w1[3] | append0[0] << 16; + w2[0] = append0[0] >> 16 | append0[1] << 16; + w2[1] = append0[1] >> 16 | append0[2] << 16; + w2[2] = append0[2] >> 16 | append0[3] << 16; + w2[3] = append0[3] >> 16 | append1[0] << 16; + w3[0] = append1[0] >> 16 | append1[1] << 16; + w3[1] = append1[1] >> 16 | append1[2] << 16; break; case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; + w1[3] = w1[3] | append0[0] << 24; + w2[0] = append0[0] >> 8 | append0[1] << 24; + w2[1] = append0[1] >> 8 | append0[2] << 24; + w2[2] = append0[2] >> 8 | append0[3] << 24; + w2[3] = append0[3] >> 8 | append1[0] << 24; + w3[0] = append1[0] >> 8 | append1[1] << 24; + w3[1] = append1[1] >> 8 | append1[2] << 24; + break; + + case 32: + w2[0] = append0[0]; + w2[1] = append0[1]; + w2[2] = append0[2]; + w2[3] = append0[3]; + w3[0] = append1[0]; + w3[1] = append1[1]; break; } } -// before: device_memcat12L -static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4]) +// before: memcat32_9 +static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) { switch (offset) { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24; + case 0: + w0[0] = append0[0]; + w0[1] = append0[1]; + w0[2] = append0[2]; + w0[3] = append0[3]; + w1[0] = append1[0]; + w1[1] = append1[1]; + w1[2] = append1[2]; + w1[3] = append1[3]; + w2[0] = append2[0]; break; - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16; + case 1: + w0[0] = w0[0] | append0[0] << 8; + w0[1] = append0[0] >> 24 | append0[1] << 8; + w0[2] = append0[1] >> 24 | append0[2] << 8; + w0[3] = append0[2] >> 24 | append0[3] << 8; + w1[0] = append0[3] >> 24 | append1[0] << 8; + w1[1] = append1[0] >> 24 | append1[1] << 8; + w1[2] = append1[1] >> 24 | append1[2] << 8; + w1[3] = append1[2] >> 24 | append1[3] << 8; + w2[0] = append1[3] >> 24 | append2[0] << 8; + w2[1] = append2[0] >> 24; break; - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8; + case 2: + w0[0] = w0[0] | append0[0] << 16; + w0[1] = append0[0] >> 16 | append0[1] << 16; + w0[2] = append0[1] >> 16 | append0[2] << 16; + w0[3] = append0[2] >> 16 | append0[3] << 16; + w1[0] = append0[3] >> 16 | append1[0] << 16; + w1[1] = append1[0] >> 16 | append1[1] << 16; + w1[2] = append1[1] >> 16 | append1[2] << 16; + w1[3] = append1[2] >> 16 | append1[3] << 16; + w2[0] = append1[3] >> 16 | append2[0] << 16; + w2[1] = append2[0] >> 16; + break; + + case 3: + w0[0] = w0[0] | append0[0] << 24; + w0[1] = append0[0] >> 8 | append0[1] << 24; + w0[2] = append0[1] >> 8 | append0[2] << 24; + w0[3] = append0[2] >> 8 | append0[3] << 24; + w1[0] = append0[3] >> 8 | append1[0] << 24; + w1[1] = append1[0] >> 8 | append1[1] << 24; + w1[2] = append1[1] >> 8 | append1[2] << 24; + w1[3] = append1[2] >> 8 | append1[3] << 24; + w2[0] = append1[3] >> 8 | append2[0] << 24; + w2[1] = append2[0] >> 8; break; case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; + w0[1] = append0[0]; + w0[2] = append0[1]; + w0[3] = append0[2]; + w1[0] = append0[3]; + w1[1] = append1[0]; + w1[2] = append1[1]; + w1[3] = append1[2]; + w2[0] = append1[3]; + w2[1] = append2[0]; break; case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24; + w0[1] = w0[1] | append0[0] << 8; + w0[2] = append0[0] >> 24 | append0[1] << 8; + w0[3] = append0[1] >> 24 | append0[2] << 8; + w1[0] = append0[2] >> 24 | append0[3] << 8; + w1[1] = append0[3] >> 24 | append1[0] << 8; + w1[2] = append1[0] >> 24 | append1[1] << 8; + w1[3] = append1[1] >> 24 | append1[2] << 8; + w2[0] = append1[2] >> 24 | append1[3] << 8; + w2[1] = append1[3] >> 24 | append2[0] << 8; + w2[2] = append2[0] >> 24; break; case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16; + w0[1] = w0[1] | append0[0] << 16; + w0[2] = append0[0] >> 16 | append0[1] << 16; + w0[3] = append0[1] >> 16 | append0[2] << 16; + w1[0] = append0[2] >> 16 | append0[3] << 16; + w1[1] = append0[3] >> 16 | append1[0] << 16; + w1[2] = append1[0] >> 16 | append1[1] << 16; + w1[3] = append1[1] >> 16 | append1[2] << 16; + w2[0] = append1[2] >> 16 | append1[3] << 16; + w2[1] = append1[3] >> 16 | append2[0] << 16; + w2[2] = append2[0] >> 16; break; case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8; + w0[1] = w0[1] | append0[0] << 24; + w0[2] = append0[0] >> 8 | append0[1] << 24; + w0[3] = append0[1] >> 8 | append0[2] << 24; + w1[0] = append0[2] >> 8 | append0[3] << 24; + w1[1] = append0[3] >> 8 | append1[0] << 24; + w1[2] = append1[0] >> 8 | append1[1] << 24; + w1[3] = append1[1] >> 8 | append1[2] << 24; + w2[0] = append1[2] >> 8 | append1[3] << 24; + w2[1] = append1[3] >> 8 | append2[0] << 24; + w2[2] = append2[0] >> 8; break; case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; + w0[2] = append0[0]; + w0[3] = append0[1]; + w1[0] = append0[2]; + w1[1] = append0[3]; + w1[2] = append1[0]; + w1[3] = append1[1]; + w2[0] = append1[2]; + w2[1] = append1[3]; + w2[2] = append2[0]; break; case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24; + w0[2] = w0[2] | append0[0] << 8; + w0[3] = append0[0] >> 24 | append0[1] << 8; + w1[0] = append0[1] >> 24 | append0[2] << 8; + w1[1] = append0[2] >> 24 | append0[3] << 8; + w1[2] = append0[3] >> 24 | append1[0] << 8; + w1[3] = append1[0] >> 24 | append1[1] << 8; + w2[0] = append1[1] >> 24 | append1[2] << 8; + w2[1] = append1[2] >> 24 | append1[3] << 8; + w2[2] = append1[3] >> 24 | append2[0] << 8; + w2[3] = append2[0] >> 24; break; case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16; + w0[2] = w0[2] | append0[0] << 16; + w0[3] = append0[0] >> 16 | append0[1] << 16; + w1[0] = append0[1] >> 16 | append0[2] << 16; + w1[1] = append0[2] >> 16 | append0[3] << 16; + w1[2] = append0[3] >> 16 | append1[0] << 16; + w1[3] = append1[0] >> 16 | append1[1] << 16; + w2[0] = append1[1] >> 16 | append1[2] << 16; + w2[1] = append1[2] >> 16 | append1[3] << 16; + w2[2] = append1[3] >> 16 | append2[0] << 16; + w2[3] = append2[0] >> 16; break; case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8; + w0[2] = w0[2] | append0[0] << 24; + w0[3] = append0[0] >> 8 | append0[1] << 24; + w1[0] = append0[1] >> 8 | append0[2] << 24; + w1[1] = append0[2] >> 8 | append0[3] << 24; + w1[2] = append0[3] >> 8 | append1[0] << 24; + w1[3] = append1[0] >> 8 | append1[1] << 24; + w2[0] = append1[1] >> 8 | append1[2] << 24; + w2[1] = append1[2] >> 8 | append1[3] << 24; + w2[2] = append1[3] >> 8 | append2[0] << 24; + w2[3] = append2[0] >> 8; break; case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; + w0[3] = append0[0]; + w1[0] = append0[1]; + w1[1] = append0[2]; + w1[2] = append0[3]; + w1[3] = append1[0]; + w2[0] = append1[1]; + w2[1] = append1[2]; + w2[2] = append1[3]; + w2[3] = append2[0]; break; case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24; + w0[3] = w0[3] | append0[0] << 8; + w1[0] = append0[0] >> 24 | append0[1] << 8; + w1[1] = append0[1] >> 24 | append0[2] << 8; + w1[2] = append0[2] >> 24 | append0[3] << 8; + w1[3] = append0[3] >> 24 | append1[0] << 8; + w2[0] = append1[0] >> 24 | append1[1] << 8; + w2[1] = append1[1] >> 24 | append1[2] << 8; + w2[2] = append1[2] >> 24 | append1[3] << 8; + w2[3] = append1[3] >> 24 | append2[0] << 8; + w3[0] = append2[0] >> 24; break; case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16; + w0[3] = w0[3] | append0[0] << 16; + w1[0] = append0[0] >> 16 | append0[1] << 16; + w1[1] = append0[1] >> 16 | append0[2] << 16; + w1[2] = append0[2] >> 16 | append0[3] << 16; + w1[3] = append0[3] >> 16 | append1[0] << 16; + w2[0] = append1[0] >> 16 | append1[1] << 16; + w2[1] = append1[1] >> 16 | append1[2] << 16; + w2[2] = append1[2] >> 16 | append1[3] << 16; + w2[3] = append1[3] >> 16 | append2[0] << 16; + w3[0] = append2[0] >> 16; break; case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8; + w0[3] = w0[3] | append0[0] << 24; + w1[0] = append0[0] >> 8 | append0[1] << 24; + w1[1] = append0[1] >> 8 | append0[2] << 24; + w1[2] = append0[2] >> 8 | append0[3] << 24; + w1[3] = append0[3] >> 8 | append1[0] << 24; + w2[0] = append1[0] >> 8 | append1[1] << 24; + w2[1] = append1[1] >> 8 | append1[2] << 24; + w2[2] = append1[2] >> 8 | append1[3] << 24; + w2[3] = append1[3] >> 8 | append2[0] << 24; + w3[0] = append2[0] >> 8; break; case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; + w1[0] = append0[0]; + w1[1] = append0[1]; + w1[2] = append0[2]; + w1[3] = append0[3]; + w2[0] = append1[0]; + w2[1] = append1[1]; + w2[2] = append1[2]; + w2[3] = append1[3]; + w3[0] = append2[0]; break; case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[0] = src_r0[3] >> 24; + w1[0] = w1[0] | append0[0] << 8; + w1[1] = append0[0] >> 24 | append0[1] << 8; + w1[2] = append0[1] >> 24 | append0[2] << 8; + w1[3] = append0[2] >> 24 | append0[3] << 8; + w2[0] = append0[3] >> 24 | append1[0] << 8; + w2[1] = append1[0] >> 24 | append1[1] << 8; + w2[2] = append1[1] >> 24 | append1[2] << 8; + w2[3] = append1[2] >> 24 | append1[3] << 8; + w3[0] = append1[3] >> 24 | append2[0] << 8; + w3[1] = append2[0] >> 24; break; case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[0] = src_r0[3] >> 16; + w1[0] = w1[0] | append0[0] << 16; + w1[1] = append0[0] >> 16 | append0[1] << 16; + w1[2] = append0[1] >> 16 | append0[2] << 16; + w1[3] = append0[2] >> 16 | append0[3] << 16; + w2[0] = append0[3] >> 16 | append1[0] << 16; + w2[1] = append1[0] >> 16 | append1[1] << 16; + w2[2] = append1[1] >> 16 | append1[2] << 16; + w2[3] = append1[2] >> 16 | append1[3] << 16; + w3[0] = append1[3] >> 16 | append2[0] << 16; + w3[1] = append2[0] >> 16; break; case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[0] = src_r0[3] >> 8; - break; - - case 20: - dst1[1] = src_r0[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - dst2[0] = src_r0[3]; + w1[0] = w1[0] | append0[0] << 24; + w1[1] = append0[0] >> 8 | append0[1] << 24; + w1[2] = append0[1] >> 8 | append0[2] << 24; + w1[3] = append0[2] >> 8 | append0[3] << 24; + w2[0] = append0[3] >> 8 | append1[0] << 24; + w2[1] = append1[0] >> 8 | append1[1] << 24; + w2[2] = append1[1] >> 8 | append1[2] << 24; + w2[3] = append1[2] >> 8 | append1[3] << 24; + w3[0] = append1[3] >> 8 | append2[0] << 24; + w3[1] = append2[0] >> 8; + break; + + case 20: + w1[1] = append0[0]; + w1[2] = append0[1]; + w1[3] = append0[2]; + w2[0] = append0[3]; + w2[1] = append1[0]; + w2[2] = append1[1]; + w2[3] = append1[2]; + w3[0] = append1[3]; + w3[1] = append2[0]; break; case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[1] = src_r0[3] >> 24; + w1[1] = w1[1] | append0[0] << 8; + w1[2] = append0[0] >> 24 | append0[1] << 8; + w1[3] = append0[1] >> 24 | append0[2] << 8; + w2[0] = append0[2] >> 24 | append0[3] << 8; + w2[1] = append0[3] >> 24 | append1[0] << 8; + w2[2] = append1[0] >> 24 | append1[1] << 8; + w2[3] = append1[1] >> 24 | append1[2] << 8; + w3[0] = append1[2] >> 24 | append1[3] << 8; + w3[1] = append1[3] >> 24 | append2[0] << 8; break; case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[1] = src_r0[3] >> 16; + w1[1] = w1[1] | append0[0] << 16; + w1[2] = append0[0] >> 16 | append0[1] << 16; + w1[3] = append0[1] >> 16 | append0[2] << 16; + w2[0] = append0[2] >> 16 | append0[3] << 16; + w2[1] = append0[3] >> 16 | append1[0] << 16; + w2[2] = append1[0] >> 16 | append1[1] << 16; + w2[3] = append1[1] >> 16 | append1[2] << 16; + w3[0] = append1[2] >> 16 | append1[3] << 16; + w3[1] = append1[3] >> 16 | append2[0] << 16; break; case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[1] = src_r0[3] >> 8; + w1[1] = w1[1] | append0[0] << 24; + w1[2] = append0[0] >> 8 | append0[1] << 24; + w1[3] = append0[1] >> 8 | append0[2] << 24; + w2[0] = append0[2] >> 8 | append0[3] << 24; + w2[1] = append0[3] >> 8 | append1[0] << 24; + w2[2] = append1[0] >> 8 | append1[1] << 24; + w2[3] = append1[1] >> 8 | append1[2] << 24; + w3[0] = append1[2] >> 8 | append1[3] << 24; + w3[1] = append1[3] >> 8 | append2[0] << 24; break; case 24: - dst1[2] = src_r0[0]; - dst1[3] = src_r0[1]; - dst2[0] = src_r0[2]; - dst2[1] = src_r0[3]; + w1[2] = append0[0]; + w1[3] = append0[1]; + w2[0] = append0[2]; + w2[1] = append0[3]; + w2[2] = append1[0]; + w2[3] = append1[1]; + w3[0] = append1[2]; + w3[1] = append1[3]; break; case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[2] = src_r0[3] >> 24; + w1[2] = w1[2] | append0[0] << 8; + w1[3] = append0[0] >> 24 | append0[1] << 8; + w2[0] = append0[1] >> 24 | append0[2] << 8; + w2[1] = append0[2] >> 24 | append0[3] << 8; + w2[2] = append0[3] >> 24 | append1[0] << 8; + w2[3] = append1[0] >> 24 | append1[1] << 8; + w3[0] = append1[1] >> 24 | append1[2] << 8; + w3[1] = append1[2] >> 24 | append1[3] << 8; break; case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[2] = src_r0[3] >> 16; + w1[2] = w1[2] | append0[0] << 16; + w1[3] = append0[0] >> 16 | append0[1] << 16; + w2[0] = append0[1] >> 16 | append0[2] << 16; + w2[1] = append0[2] >> 16 | append0[3] << 16; + w2[2] = append0[3] >> 16 | append1[0] << 16; + w2[3] = append1[0] >> 16 | append1[1] << 16; + w3[0] = append1[1] >> 16 | append1[2] << 16; + w3[1] = append1[2] >> 16 | append1[3] << 16; break; case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[2] = src_r0[3] >> 8; + w1[2] = w1[2] | append0[0] << 24; + w1[3] = append0[0] >> 8 | append0[1] << 24; + w2[0] = append0[1] >> 8 | append0[2] << 24; + w2[1] = append0[2] >> 8 | append0[3] << 24; + w2[2] = append0[3] >> 8 | append1[0] << 24; + w2[3] = append1[0] >> 8 | append1[1] << 24; + w3[0] = append1[1] >> 8 | append1[2] << 24; + w3[1] = append1[2] >> 8 | append1[3] << 24; break; case 28: - dst1[3] = src_r0[0]; - dst2[0] = src_r0[1]; - dst2[1] = src_r0[2]; - dst2[2] = src_r0[3]; + w1[3] = append0[0]; + w2[0] = append0[1]; + w2[1] = append0[2]; + w2[2] = append0[3]; + w2[3] = append1[0]; + w3[0] = append1[1]; + w3[1] = append1[2]; break; case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[3] = src_r0[3] >> 24; + w1[3] = w1[3] | append0[0] << 8; + w2[0] = append0[0] >> 24 | append0[1] << 8; + w2[1] = append0[1] >> 24 | append0[2] << 8; + w2[2] = append0[2] >> 24 | append0[3] << 8; + w2[3] = append0[3] >> 24 | append1[0] << 8; + w3[0] = append1[0] >> 24 | append1[1] << 8; + w3[1] = append1[1] >> 24 | append1[2] << 8; break; case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[3] = src_r0[3] >> 16; + w1[3] = w1[3] | append0[0] << 16; + w2[0] = append0[0] >> 16 | append0[1] << 16; + w2[1] = append0[1] >> 16 | append0[2] << 16; + w2[2] = append0[2] >> 16 | append0[3] << 16; + w2[3] = append0[3] >> 16 | append1[0] << 16; + w3[0] = append1[0] >> 16 | append1[1] << 16; + w3[1] = append1[1] >> 16 | append1[2] << 16; break; case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[3] = src_r0[3] >> 8; + w1[3] = w1[3] | append0[0] << 24; + w2[0] = append0[0] >> 8 | append0[1] << 24; + w2[1] = append0[1] >> 8 | append0[2] << 24; + w2[2] = append0[2] >> 8 | append0[3] << 24; + w2[3] = append0[3] >> 8 | append1[0] << 24; + w3[0] = append1[0] >> 8 | append1[1] << 24; + w3[1] = append1[1] >> 8 | append1[2] << 24; break; case 32: - dst2[0] = src_r0[0]; - dst2[1] = src_r0[1]; - dst2[2] = src_r0[2]; - dst2[3] = src_r0[3]; - break; - - case 33: - dst2[0] = src_l2[0] | src_r0[0] << 8; - dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; - - case 34: - dst2[0] = src_l2[0] | src_r0[0] << 16; - dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; + w2[0] = append0[0]; + w2[1] = append0[1]; + w2[2] = append0[2]; + w2[3] = append0[3]; + w3[0] = append1[0]; + w3[1] = append1[1]; break; + } +} - case 35: - dst2[0] = src_l2[0] | src_r0[0] << 24; - dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; +static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + #ifdef IS_AMD + const int offset_mod_4 = offset & 3; - case 36: - dst2[1] = src_r0[0]; - dst2[2] = src_r0[1]; - dst2[3] = src_r0[2]; - break; + const int offset_minus_4 = 4 - offset; - case 37: - dst2[1] = src_l2[1] | src_r0[0] << 8; - dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; + switch (offset / 4) + { + case 0: + w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4); + w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - case 38: - dst2[1] = src_l2[1] | src_r0[0] << 16; - dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } - case 39: - dst2[1] = src_l2[1] | src_r0[0] << 24; - dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; break; - case 40: - dst2[2] = src_r0[0]; - dst2[3] = src_r0[1]; + case 1: + w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4); + w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 2: + w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4); + w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 3: + w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4); + w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 4: + w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4); + w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 5: + w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4); + w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 6: + w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4); + w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 7: + w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4); + w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 8: + w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4); + w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 9: + w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4); + w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 10: + w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4); + w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 11: + w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4); + w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 12: + w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4); + w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + + case 13: + w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4); + w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = 0; + } + + break; + } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w3[1] = __byte_perm (w3[0], w3[1], selector); + w3[0] = __byte_perm (w2[3], w3[0], selector); + w2[3] = __byte_perm (w2[2], w2[3], selector); + w2[2] = __byte_perm (w2[1], w2[2], selector); + w2[1] = __byte_perm (w2[0], w2[1], selector); + w2[0] = __byte_perm (w1[3], w2[0], selector); + w1[3] = __byte_perm (w1[2], w1[3], selector); + w1[2] = __byte_perm (w1[1], w1[2], selector); + w1[1] = __byte_perm (w1[0], w1[1], selector); + w1[0] = __byte_perm (w0[3], w1[0], selector); + w0[3] = __byte_perm (w0[2], w0[3], selector); + w0[2] = __byte_perm (w0[1], w0[2], selector); + w0[1] = __byte_perm (w0[0], w0[1], selector); + w0[0] = __byte_perm ( 0, w0[0], selector); + + break; + + case 1: + w3[1] = __byte_perm (w2[3], w3[0], selector); + w3[0] = __byte_perm (w2[2], w2[3], selector); + w2[3] = __byte_perm (w2[1], w2[2], selector); + w2[2] = __byte_perm (w2[0], w2[1], selector); + w2[1] = __byte_perm (w1[3], w2[0], selector); + w2[0] = __byte_perm (w1[2], w1[3], selector); + w1[3] = __byte_perm (w1[1], w1[2], selector); + w1[2] = __byte_perm (w1[0], w1[1], selector); + w1[1] = __byte_perm (w0[3], w1[0], selector); + w1[0] = __byte_perm (w0[2], w0[3], selector); + w0[3] = __byte_perm (w0[1], w0[2], selector); + w0[2] = __byte_perm (w0[0], w0[1], selector); + w0[1] = __byte_perm ( 0, w0[0], selector); + w0[0] = 0; + + break; + + case 2: + w3[1] = __byte_perm (w2[2], w2[3], selector); + w3[0] = __byte_perm (w2[1], w2[2], selector); + w2[3] = __byte_perm (w2[0], w2[1], selector); + w2[2] = __byte_perm (w1[3], w2[0], selector); + w2[1] = __byte_perm (w1[2], w1[3], selector); + w2[0] = __byte_perm (w1[1], w1[2], selector); + w1[3] = __byte_perm (w1[0], w1[1], selector); + w1[2] = __byte_perm (w0[3], w1[0], selector); + w1[1] = __byte_perm (w0[2], w0[3], selector); + w1[0] = __byte_perm (w0[1], w0[2], selector); + w0[3] = __byte_perm (w0[0], w0[1], selector); + w0[2] = __byte_perm ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + w3[1] = __byte_perm (w2[1], w2[2], selector); + w3[0] = __byte_perm (w2[0], w2[1], selector); + w2[3] = __byte_perm (w1[3], w2[0], selector); + w2[2] = __byte_perm (w1[2], w1[3], selector); + w2[1] = __byte_perm (w1[1], w1[2], selector); + w2[0] = __byte_perm (w1[0], w1[1], selector); + w1[3] = __byte_perm (w0[3], w1[0], selector); + w1[2] = __byte_perm (w0[2], w0[3], selector); + w1[1] = __byte_perm (w0[1], w0[2], selector); + w1[0] = __byte_perm (w0[0], w0[1], selector); + w0[3] = __byte_perm ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + w3[1] = __byte_perm (w2[0], w2[1], selector); + w3[0] = __byte_perm (w1[3], w2[0], selector); + w2[3] = __byte_perm (w1[2], w1[3], selector); + w2[2] = __byte_perm (w1[1], w1[2], selector); + w2[1] = __byte_perm (w1[0], w1[1], selector); + w2[0] = __byte_perm (w0[3], w1[0], selector); + w1[3] = __byte_perm (w0[2], w0[3], selector); + w1[2] = __byte_perm (w0[1], w0[2], selector); + w1[1] = __byte_perm (w0[0], w0[1], selector); + w1[0] = __byte_perm ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 41: - dst2[2] = src_l2[2] | src_r0[0] << 8; - dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; + case 5: + w3[1] = __byte_perm (w1[3], w2[0], selector); + w3[0] = __byte_perm (w1[2], w1[3], selector); + w2[3] = __byte_perm (w1[1], w1[2], selector); + w2[2] = __byte_perm (w1[0], w1[1], selector); + w2[1] = __byte_perm (w0[3], w1[0], selector); + w2[0] = __byte_perm (w0[2], w0[3], selector); + w1[3] = __byte_perm (w0[1], w0[2], selector); + w1[2] = __byte_perm (w0[0], w0[1], selector); + w1[1] = __byte_perm ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 42: - dst2[2] = src_l2[2] | src_r0[0] << 16; - dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; + case 6: + w3[1] = __byte_perm (w1[2], w1[3], selector); + w3[0] = __byte_perm (w1[1], w1[2], selector); + w2[3] = __byte_perm (w1[0], w1[1], selector); + w2[2] = __byte_perm (w0[3], w1[0], selector); + w2[1] = __byte_perm (w0[2], w0[3], selector); + w2[0] = __byte_perm (w0[1], w0[2], selector); + w1[3] = __byte_perm (w0[0], w0[1], selector); + w1[2] = __byte_perm ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 43: - dst2[2] = src_l2[2] | src_r0[0] << 24; - dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; + case 7: + w3[1] = __byte_perm (w1[1], w1[2], selector); + w3[0] = __byte_perm (w1[0], w1[1], selector); + w2[3] = __byte_perm (w0[3], w1[0], selector); + w2[2] = __byte_perm (w0[2], w0[3], selector); + w2[1] = __byte_perm (w0[1], w0[2], selector); + w2[0] = __byte_perm (w0[0], w0[1], selector); + w1[3] = __byte_perm ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 44: - dst2[3] = src_r0[0]; + case 8: + w3[1] = __byte_perm (w1[0], w1[1], selector); + w3[0] = __byte_perm (w0[3], w1[0], selector); + w2[3] = __byte_perm (w0[2], w0[3], selector); + w2[2] = __byte_perm (w0[1], w0[2], selector); + w2[1] = __byte_perm (w0[0], w0[1], selector); + w2[0] = __byte_perm ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 45: - dst2[3] = src_l2[3] | src_r0[0] << 8; + case 9: + w3[1] = __byte_perm (w0[3], w1[0], selector); + w3[0] = __byte_perm (w0[2], w0[3], selector); + w2[3] = __byte_perm (w0[1], w0[2], selector); + w2[2] = __byte_perm (w0[0], w0[1], selector); + w2[1] = __byte_perm ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 46: - dst2[3] = src_l2[3] | src_r0[0] << 16; + case 10: + w3[1] = __byte_perm (w0[2], w0[3], selector); + w3[0] = __byte_perm (w0[1], w0[2], selector); + w2[3] = __byte_perm (w0[0], w0[1], selector); + w2[2] = __byte_perm ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; - case 47: - dst2[3] = src_l2[3] | src_r0[0] << 24; + case 11: + w3[1] = __byte_perm (w0[1], w0[2], selector); + w3[0] = __byte_perm (w0[0], w0[1], selector); + w2[3] = __byte_perm ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + w3[1] = __byte_perm (w0[0], w0[1], selector); + w3[0] = __byte_perm ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + w3[1] = __byte_perm ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; } + #endif } -// before: device_memcat12L -static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4]) +static void switch_buffer_by_offset_be (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - switch (offset) + #ifdef IS_AMD + switch (offset / 4) { case 0: - dst0[0] = src_r0[0]; - dst0[1] = src_r0[1]; - dst0[2] = src_r0[2]; - dst0[3] = src_r0[3]; - dst1[0] = src_r1[0]; - dst1[1] = src_r1[1]; - dst1[2] = src_r1[2]; - dst1[3] = src_r1[3]; + w3[2] = amd_bytealign (w3[1], 0, offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8; - dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8; - dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[0] = src_r1[3] >> 24; + w3[2] = amd_bytealign (w3[0], 0, offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); + w0[0] = 0; break; case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16; - dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16; - dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[0] = src_r1[3] >> 16; + w3[2] = amd_bytealign (w2[3], 0, offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; break; case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24; - dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24; - dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[0] = src_r1[3] >> 8; + w3[2] = amd_bytealign (w2[2], 0, offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; - dst1[1] = src_r1[0]; - dst1[2] = src_r1[1]; - dst1[3] = src_r1[2]; - dst2[0] = src_r1[3]; + w3[2] = amd_bytealign (w2[1], 0, offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8; - dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[1] = src_r1[3] >> 24; + w3[2] = amd_bytealign (w2[0], 0, offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16; - dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[1] = src_r1[3] >> 16; + w3[2] = amd_bytealign (w1[3], 0, offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24; - dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[1] = src_r1[3] >> 8; + w3[2] = amd_bytealign (w1[2], 0, offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; - dst1[2] = src_r1[0]; - dst1[3] = src_r1[1]; - dst2[0] = src_r1[2]; - dst2[1] = src_r1[3]; + w3[2] = amd_bytealign (w1[1], 0, offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[2] = src_r1[3] >> 24; + w3[2] = amd_bytealign (w1[0], 0, offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[2] = src_r1[3] >> 16; + w3[2] = amd_bytealign (w0[3], 0, offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[2] = src_r1[3] >> 8; - break; - - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; - dst1[3] = src_r1[0]; - dst2[0] = src_r1[1]; - dst2[1] = src_r1[2]; - dst2[2] = src_r1[3]; - break; - - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[3] = src_r1[3] >> 24; - break; - - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[3] = src_r1[3] >> 16; - break; - - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[3] = src_r1[3] >> 8; - break; - - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; - dst2[0] = src_r1[0]; - dst2[1] = src_r1[1]; - dst2[2] = src_r1[2]; - dst2[3] = src_r1[3]; + w3[2] = amd_bytealign (w0[2], 0, offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8; + case 12: + w3[2] = amd_bytealign (w0[1], 0, offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16; + case 13: + w3[2] = amd_bytealign (w0[0], 0, offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; + } + #endif - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24; - break; + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - case 20: - dst1[1] = src_r1[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - dst2[0] = src_r0[3]; - dst2[1] = src_r1[0]; - dst2[2] = src_r1[1]; - dst2[3] = src_r1[2]; + switch (offset / 4) + { + case 0: + w3[1] = __byte_perm (w3[1], w3[0], selector); + w3[0] = __byte_perm (w3[0], w2[3], selector); + w2[3] = __byte_perm (w2[3], w2[2], selector); + w2[2] = __byte_perm (w2[2], w2[1], selector); + w2[1] = __byte_perm (w2[1], w2[0], selector); + w2[0] = __byte_perm (w2[0], w1[3], selector); + w1[3] = __byte_perm (w1[3], w1[2], selector); + w1[2] = __byte_perm (w1[2], w1[1], selector); + w1[1] = __byte_perm (w1[1], w1[0], selector); + w1[0] = __byte_perm (w1[0], w0[3], selector); + w0[3] = __byte_perm (w0[3], w0[2], selector); + w0[2] = __byte_perm (w0[2], w0[1], selector); + w0[1] = __byte_perm (w0[1], w0[0], selector); + w0[0] = __byte_perm (w0[0], 0, selector); break; - case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8; + case 1: + w3[1] = __byte_perm (w3[0], w2[3], selector); + w3[0] = __byte_perm (w2[3], w2[2], selector); + w2[3] = __byte_perm (w2[2], w2[1], selector); + w2[2] = __byte_perm (w2[1], w2[0], selector); + w2[1] = __byte_perm (w2[0], w1[3], selector); + w2[0] = __byte_perm (w1[3], w1[2], selector); + w1[3] = __byte_perm (w1[2], w1[1], selector); + w1[2] = __byte_perm (w1[1], w1[0], selector); + w1[1] = __byte_perm (w1[0], w0[3], selector); + w1[0] = __byte_perm (w0[3], w0[2], selector); + w0[3] = __byte_perm (w0[2], w0[1], selector); + w0[2] = __byte_perm (w0[1], w0[0], selector); + w0[1] = __byte_perm (w0[0], 0, selector); + w0[0] = 0; break; - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16; + case 2: + w3[1] = __byte_perm (w2[3], w2[2], selector); + w3[0] = __byte_perm (w2[2], w2[1], selector); + w2[3] = __byte_perm (w2[1], w2[0], selector); + w2[2] = __byte_perm (w2[0], w1[3], selector); + w2[1] = __byte_perm (w1[3], w1[2], selector); + w2[0] = __byte_perm (w1[2], w1[1], selector); + w1[3] = __byte_perm (w1[1], w1[0], selector); + w1[2] = __byte_perm (w1[0], w0[3], selector); + w1[1] = __byte_perm (w0[3], w0[2], selector); + w1[0] = __byte_perm (w0[2], w0[1], selector); + w0[3] = __byte_perm (w0[1], w0[0], selector); + w0[2] = __byte_perm (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; break; - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24; + case 3: + w3[1] = __byte_perm (w2[2], w2[1], selector); + w3[0] = __byte_perm (w2[1], w2[0], selector); + w2[3] = __byte_perm (w2[0], w1[3], selector); + w2[2] = __byte_perm (w1[3], w1[2], selector); + w2[1] = __byte_perm (w1[2], w1[1], selector); + w2[0] = __byte_perm (w1[1], w1[0], selector); + w1[3] = __byte_perm (w1[0], w0[3], selector); + w1[2] = __byte_perm (w0[3], w0[2], selector); + w1[1] = __byte_perm (w0[2], w0[1], selector); + w1[0] = __byte_perm (w0[1], w0[0], selector); + w0[3] = __byte_perm (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 24: - dst1[2] = src_r1[0]; - dst1[3] = src_r0[1]; - dst2[0] = src_r0[2]; - dst2[1] = src_r0[3]; - dst2[2] = src_r1[0]; - dst2[3] = src_r1[1]; + case 4: + w3[1] = __byte_perm (w2[1], w2[0], selector); + w3[0] = __byte_perm (w2[0], w1[3], selector); + w2[3] = __byte_perm (w1[3], w1[2], selector); + w2[2] = __byte_perm (w1[2], w1[1], selector); + w2[1] = __byte_perm (w1[1], w1[0], selector); + w2[0] = __byte_perm (w1[0], w0[3], selector); + w1[3] = __byte_perm (w0[3], w0[2], selector); + w1[2] = __byte_perm (w0[2], w0[1], selector); + w1[1] = __byte_perm (w0[1], w0[0], selector); + w1[0] = __byte_perm (w0[0], 0, selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8; + case 5: + w3[1] = __byte_perm (w2[0], w1[3], selector); + w3[0] = __byte_perm (w1[3], w1[2], selector); + w2[3] = __byte_perm (w1[2], w1[1], selector); + w2[2] = __byte_perm (w1[1], w1[0], selector); + w2[1] = __byte_perm (w1[0], w0[3], selector); + w2[0] = __byte_perm (w0[3], w0[2], selector); + w1[3] = __byte_perm (w0[2], w0[1], selector); + w1[2] = __byte_perm (w0[1], w0[0], selector); + w1[1] = __byte_perm (w0[0], 0, selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16; + case 6: + w3[1] = __byte_perm (w1[3], w1[2], selector); + w3[0] = __byte_perm (w1[2], w1[1], selector); + w2[3] = __byte_perm (w1[1], w1[0], selector); + w2[2] = __byte_perm (w1[0], w0[3], selector); + w2[1] = __byte_perm (w0[3], w0[2], selector); + w2[0] = __byte_perm (w0[2], w0[1], selector); + w1[3] = __byte_perm (w0[1], w0[0], selector); + w1[2] = __byte_perm (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24; + case 7: + w3[1] = __byte_perm (w1[2], w1[1], selector); + w3[0] = __byte_perm (w1[1], w1[0], selector); + w2[3] = __byte_perm (w1[0], w0[3], selector); + w2[2] = __byte_perm (w0[3], w0[2], selector); + w2[1] = __byte_perm (w0[2], w0[1], selector); + w2[0] = __byte_perm (w0[1], w0[0], selector); + w1[3] = __byte_perm (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 28: - dst1[3] = src_r1[0]; - dst2[0] = src_r0[1]; - dst2[1] = src_r0[2]; - dst2[2] = src_r0[3]; - dst2[3] = src_r1[0]; + case 8: + w3[1] = __byte_perm (w1[1], w1[0], selector); + w3[0] = __byte_perm (w1[0], w0[3], selector); + w2[3] = __byte_perm (w0[3], w0[2], selector); + w2[2] = __byte_perm (w0[2], w0[1], selector); + w2[1] = __byte_perm (w0[1], w0[0], selector); + w2[0] = __byte_perm (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8; + case 9: + w3[1] = __byte_perm (w1[0], w0[3], selector); + w3[0] = __byte_perm (w0[3], w0[2], selector); + w2[3] = __byte_perm (w0[2], w0[1], selector); + w2[2] = __byte_perm (w0[1], w0[0], selector); + w2[1] = __byte_perm (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16; + case 10: + w3[1] = __byte_perm (w0[3], w0[2], selector); + w3[0] = __byte_perm (w0[2], w0[1], selector); + w2[3] = __byte_perm (w0[1], w0[0], selector); + w2[2] = __byte_perm (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24; + case 11: + w3[1] = __byte_perm (w0[2], w0[1], selector); + w3[0] = __byte_perm (w0[1], w0[0], selector); + w2[3] = __byte_perm (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 32: - dst2[0] = src_r0[0]; - dst2[1] = src_r0[1]; - dst2[2] = src_r0[2]; - dst2[3] = src_r0[3]; + case 12: + w3[1] = __byte_perm (w0[1], w0[0], selector); + w3[0] = __byte_perm (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; - case 33: - dst2[0] = src_l2[0] | src_r0[0] << 8; - dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; + case 13: + w3[1] = __byte_perm (w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; + } + #endif +} - case 34: - dst2[0] = src_l2[0] | src_r0[0] << 16; - dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; +/* not needed anymore? +// before: append_0x80_2_be +static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] |= 0x80000000; break; - case 35: - dst2[0] = src_l2[0] | src_r0[0] << 24; - dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; + case 1: + w0[0] |= 0x800000; break; - case 36: - dst2[1] = src_r0[0]; - dst2[2] = src_r0[1]; - dst2[3] = src_r0[2]; + case 2: + w0[0] |= 0x8000; break; - case 37: - dst2[1] = src_l2[1] | src_r0[0] << 8; - dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; + case 3: + w0[0] |= 0x80; break; - case 38: - dst2[1] = src_l2[1] | src_r0[0] << 16; - dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; + case 4: + w0[1] |= 0x80000000; break; - case 39: - dst2[1] = src_l2[1] | src_r0[0] << 24; - dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; + case 5: + w0[1] |= 0x800000; break; - case 40: - dst2[2] = src_r0[0]; - dst2[3] = src_r0[1]; + case 6: + w0[1] |= 0x8000; break; - case 41: - dst2[2] = src_l2[2] | src_r0[0] << 8; - dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; + case 7: + w0[1] |= 0x80; break; - case 42: - dst2[2] = src_l2[2] | src_r0[0] << 16; - dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; + case 8: + w0[2] |= 0x80000000; break; - case 43: - dst2[2] = src_l2[2] | src_r0[0] << 24; - dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; + case 9: + w0[2] |= 0x800000; break; - case 44: - dst2[3] = src_r0[0]; + case 10: + w0[2] |= 0x8000; break; - case 45: - dst2[3] = src_l2[3] | src_r0[0] << 8; + case 11: + w0[2] |= 0x80; break; - case 46: - dst2[3] = src_l2[3] | src_r0[0] << 16; + case 12: + w0[3] |= 0x80000000; break; - case 47: - dst2[3] = src_l2[3] | src_r0[0] << 24; + case 13: + w0[3] |= 0x800000; break; - } -} -// before: memcat16_9 -static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) -{ - switch (offset) - { - case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; - w2[0] = append2[0]; + case 14: + w0[3] |= 0x8000; break; - case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24 | append2[0] << 8; - w2[1] = append2[0] >> 24; + case 15: + w0[3] |= 0x80; break; - case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16 | append2[0] << 16; - w2[1] = append2[0] >> 16; + case 16: + w1[0] |= 0x80000000; break; - - case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8 | append2[0] << 24; - w2[1] = append2[0] >> 8; + + case 17: + w1[0] |= 0x800000; break; - case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; - w2[1] = append2[0]; + case 18: + w1[0] |= 0x8000; break; - case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24 | append2[0] << 8; - w2[2] = append2[0] >> 24; + case 19: + w1[0] |= 0x80; break; - case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16 | append2[0] << 16; - w2[2] = append2[0] >> 16; + case 20: + w1[1] |= 0x80000000; break; - case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8 | append2[0] << 24; - w2[2] = append2[0] >> 8; + case 21: + w1[1] |= 0x800000; break; - case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; - w2[2] = append2[0]; + case 22: + w1[1] |= 0x8000; break; - case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24 | append2[0] << 8; - w2[3] = append2[0] >> 24; + case 23: + w1[1] |= 0x80; break; - case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16 | append2[0] << 16; - w2[3] = append2[0] >> 16; + case 24: + w1[2] |= 0x80000000; break; - case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8 | append2[0] << 24; - w2[3] = append2[0] >> 8; + case 25: + w1[2] |= 0x800000; break; - case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; - w2[3] = append2[0]; + case 26: + w1[2] |= 0x8000; break; - case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24 | append2[0] << 8; - w3[0] = append2[0] >> 24; + case 27: + w1[2] |= 0x80; break; - case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16 | append2[0] << 16; - w3[0] = append2[0] >> 16; + case 28: + w1[3] |= 0x80000000; break; - case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8 | append2[0] << 24; - w3[0] = append2[0] >> 8; + case 29: + w1[3] |= 0x800000; + break; + + case 30: + w1[3] |= 0x8000; + break; + + case 31: + w1[3] |= 0x80; break; } } -// before: memcat32_8 -static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset) +// before: append_0x80_4 +static void append_0x80_1x16 (u32 w[16], const u32 offset) { switch (offset) { case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; + w[ 0] = 0x80; break; case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24; + w[ 0] = w[ 0] | 0x8000; break; case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16; + w[ 0] = w[ 0] | 0x800000; break; case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8; + w[ 0] = w[ 0] | 0x80000000; break; case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; + w[ 1] = 0x80; break; case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24; + w[ 1] = w[ 1] | 0x8000; break; case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16; + w[ 1] = w[ 1] | 0x800000; break; case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8; - break; - - case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; + w[ 1] = w[ 1] | 0x80000000; + break; + + case 8: + w[ 2] = 0x80; break; case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24; + w[ 2] = w[ 2] | 0x8000; break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16; + w[ 2] = w[ 2] | 0x800000; break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8; + w[ 2] = w[ 2] | 0x80000000; break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; + w[ 3] = 0x80; break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24; + w[ 3] = w[ 3] | 0x8000; break; case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16; + w[ 3] = w[ 3] | 0x800000; break; case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8; + w[ 3] = w[ 3] | 0x80000000; break; case 16: - w1[0] = append0[0]; - w1[1] = append0[1]; - w1[2] = append0[2]; - w1[3] = append0[3]; - w2[0] = append1[0]; - w2[1] = append1[1]; - w2[2] = append1[2]; - w2[3] = append1[3]; + w[ 4] = 0x80; break; case 17: - w1[0] = w1[0] | append0[0] << 8; - w1[1] = append0[0] >> 24 | append0[1] << 8; - w1[2] = append0[1] >> 24 | append0[2] << 8; - w1[3] = append0[2] >> 24 | append0[3] << 8; - w2[0] = append0[3] >> 24 | append1[0] << 8; - w2[1] = append1[0] >> 24 | append1[1] << 8; - w2[2] = append1[1] >> 24 | append1[2] << 8; - w2[3] = append1[2] >> 24 | append1[3] << 8; - w3[0] = append1[3] >> 24; + w[ 4] = w[ 4] | 0x8000; break; case 18: - w1[0] = w1[0] | append0[0] << 16; - w1[1] = append0[0] >> 16 | append0[1] << 16; - w1[2] = append0[1] >> 16 | append0[2] << 16; - w1[3] = append0[2] >> 16 | append0[3] << 16; - w2[0] = append0[3] >> 16 | append1[0] << 16; - w2[1] = append1[0] >> 16 | append1[1] << 16; - w2[2] = append1[1] >> 16 | append1[2] << 16; - w2[3] = append1[2] >> 16 | append1[3] << 16; - w3[0] = append1[3] >> 16; + w[ 4] = w[ 4] | 0x800000; break; case 19: - w1[0] = w1[0] | append0[0] << 24; - w1[1] = append0[0] >> 8 | append0[1] << 24; - w1[2] = append0[1] >> 8 | append0[2] << 24; - w1[3] = append0[2] >> 8 | append0[3] << 24; - w2[0] = append0[3] >> 8 | append1[0] << 24; - w2[1] = append1[0] >> 8 | append1[1] << 24; - w2[2] = append1[1] >> 8 | append1[2] << 24; - w2[3] = append1[2] >> 8 | append1[3] << 24; - w3[0] = append1[3] >> 8; + w[ 4] = w[ 4] | 0x80000000; break; case 20: - w1[1] = append0[0]; - w1[2] = append0[1]; - w1[3] = append0[2]; - w2[0] = append0[3]; - w2[1] = append1[0]; - w2[2] = append1[1]; - w2[3] = append1[2]; - w3[0] = append1[3]; + w[ 5] = 0x80; break; case 21: - w1[1] = w1[1] | append0[0] << 8; - w1[2] = append0[0] >> 24 | append0[1] << 8; - w1[3] = append0[1] >> 24 | append0[2] << 8; - w2[0] = append0[2] >> 24 | append0[3] << 8; - w2[1] = append0[3] >> 24 | append1[0] << 8; - w2[2] = append1[0] >> 24 | append1[1] << 8; - w2[3] = append1[1] >> 24 | append1[2] << 8; - w3[0] = append1[2] >> 24 | append1[3] << 8; - w3[1] = append1[3] >> 24; + w[ 5] = w[ 5] | 0x8000; break; case 22: - w1[1] = w1[1] | append0[0] << 16; - w1[2] = append0[0] >> 16 | append0[1] << 16; - w1[3] = append0[1] >> 16 | append0[2] << 16; - w2[0] = append0[2] >> 16 | append0[3] << 16; - w2[1] = append0[3] >> 16 | append1[0] << 16; - w2[2] = append1[0] >> 16 | append1[1] << 16; - w2[3] = append1[1] >> 16 | append1[2] << 16; - w3[0] = append1[2] >> 16 | append1[3] << 16; - w3[1] = append1[3] >> 16; + w[ 5] = w[ 5] | 0x800000; break; case 23: - w1[1] = w1[1] | append0[0] << 24; - w1[2] = append0[0] >> 8 | append0[1] << 24; - w1[3] = append0[1] >> 8 | append0[2] << 24; - w2[0] = append0[2] >> 8 | append0[3] << 24; - w2[1] = append0[3] >> 8 | append1[0] << 24; - w2[2] = append1[0] >> 8 | append1[1] << 24; - w2[3] = append1[1] >> 8 | append1[2] << 24; - w3[0] = append1[2] >> 8 | append1[3] << 24; - w3[1] = append1[3] >> 8; + w[ 5] = w[ 5] | 0x80000000; break; case 24: - w1[2] = append0[0]; - w1[3] = append0[1]; - w2[0] = append0[2]; - w2[1] = append0[3]; - w2[2] = append1[0]; - w2[3] = append1[1]; - w3[0] = append1[2]; - w3[1] = append1[3]; + w[ 6] = 0x80; break; case 25: - w1[2] = w1[2] | append0[0] << 8; - w1[3] = append0[0] >> 24 | append0[1] << 8; - w2[0] = append0[1] >> 24 | append0[2] << 8; - w2[1] = append0[2] >> 24 | append0[3] << 8; - w2[2] = append0[3] >> 24 | append1[0] << 8; - w2[3] = append1[0] >> 24 | append1[1] << 8; - w3[0] = append1[1] >> 24 | append1[2] << 8; - w3[1] = append1[2] >> 24 | append1[3] << 8; + w[ 6] = w[ 6] | 0x8000; break; case 26: - w1[2] = w1[2] | append0[0] << 16; - w1[3] = append0[0] >> 16 | append0[1] << 16; - w2[0] = append0[1] >> 16 | append0[2] << 16; - w2[1] = append0[2] >> 16 | append0[3] << 16; - w2[2] = append0[3] >> 16 | append1[0] << 16; - w2[3] = append1[0] >> 16 | append1[1] << 16; - w3[0] = append1[1] >> 16 | append1[2] << 16; - w3[1] = append1[2] >> 16 | append1[3] << 16; + w[ 6] = w[ 6] | 0x800000; break; case 27: - w1[2] = w1[2] | append0[0] << 24; - w1[3] = append0[0] >> 8 | append0[1] << 24; - w2[0] = append0[1] >> 8 | append0[2] << 24; - w2[1] = append0[2] >> 8 | append0[3] << 24; - w2[2] = append0[3] >> 8 | append1[0] << 24; - w2[3] = append1[0] >> 8 | append1[1] << 24; - w3[0] = append1[1] >> 8 | append1[2] << 24; - w3[1] = append1[2] >> 8 | append1[3] << 24; + w[ 6] = w[ 6] | 0x80000000; break; case 28: - w1[3] = append0[0]; - w2[0] = append0[1]; - w2[1] = append0[2]; - w2[2] = append0[3]; - w2[3] = append1[0]; - w3[0] = append1[1]; - w3[1] = append1[2]; + w[ 7] = 0x80; break; case 29: - w1[3] = w1[3] | append0[0] << 8; - w2[0] = append0[0] >> 24 | append0[1] << 8; - w2[1] = append0[1] >> 24 | append0[2] << 8; - w2[2] = append0[2] >> 24 | append0[3] << 8; - w2[3] = append0[3] >> 24 | append1[0] << 8; - w3[0] = append1[0] >> 24 | append1[1] << 8; - w3[1] = append1[1] >> 24 | append1[2] << 8; + w[ 7] = w[ 7] | 0x8000; break; case 30: - w1[3] = w1[3] | append0[0] << 16; - w2[0] = append0[0] >> 16 | append0[1] << 16; - w2[1] = append0[1] >> 16 | append0[2] << 16; - w2[2] = append0[2] >> 16 | append0[3] << 16; - w2[3] = append0[3] >> 16 | append1[0] << 16; - w3[0] = append1[0] >> 16 | append1[1] << 16; - w3[1] = append1[1] >> 16 | append1[2] << 16; + w[ 7] = w[ 7] | 0x800000; + break; + + case 31: + w[ 7] = w[ 7] | 0x80000000; + break; + + case 32: + w[ 8] = 0x80; + break; + + case 33: + w[ 8] = w[ 8] | 0x8000; + break; + + case 34: + w[ 8] = w[ 8] | 0x800000; + break; + + case 35: + w[ 8] = w[ 8] | 0x80000000; + break; + + case 36: + w[ 9] = 0x80; + break; + + case 37: + w[ 9] = w[ 9] | 0x8000; + break; + + case 38: + w[ 9] = w[ 9] | 0x800000; + break; + + case 39: + w[ 9] = w[ 9] | 0x80000000; + break; + + case 40: + w[10] = 0x80; + break; + + case 41: + w[10] = w[10] | 0x8000; break; - case 31: - w1[3] = w1[3] | append0[0] << 24; - w2[0] = append0[0] >> 8 | append0[1] << 24; - w2[1] = append0[1] >> 8 | append0[2] << 24; - w2[2] = append0[2] >> 8 | append0[3] << 24; - w2[3] = append0[3] >> 8 | append1[0] << 24; - w3[0] = append1[0] >> 8 | append1[1] << 24; - w3[1] = append1[1] >> 8 | append1[2] << 24; + case 42: + w[10] = w[10] | 0x800000; break; - case 32: - w2[0] = append0[0]; - w2[1] = append0[1]; - w2[2] = append0[2]; - w2[3] = append0[3]; - w3[0] = append1[0]; - w3[1] = append1[1]; + case 43: + w[10] = w[10] | 0x80000000; break; - } -} -// before: memcat32_9 -static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) -{ - switch (offset) - { - case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; - w2[0] = append2[0]; + case 44: + w[11] = 0x80; break; - case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24 | append2[0] << 8; - w2[1] = append2[0] >> 24; + case 45: + w[11] = w[11] | 0x8000; break; - case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16 | append2[0] << 16; - w2[1] = append2[0] >> 16; + case 46: + w[11] = w[11] | 0x800000; break; - case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8 | append2[0] << 24; - w2[1] = append2[0] >> 8; + case 47: + w[11] = w[11] | 0x80000000; break; - case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; - w2[1] = append2[0]; + case 48: + w[12] = 0x80; break; - case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24 | append2[0] << 8; - w2[2] = append2[0] >> 24; + case 49: + w[12] = w[12] | 0x8000; break; - case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16 | append2[0] << 16; - w2[2] = append2[0] >> 16; + case 50: + w[12] = w[12] | 0x800000; break; - case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8 | append2[0] << 24; - w2[2] = append2[0] >> 8; + case 51: + w[12] = w[12] | 0x80000000; break; - case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; - w2[2] = append2[0]; + case 52: + w[13] = 0x80; break; - case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24 | append2[0] << 8; - w2[3] = append2[0] >> 24; + case 53: + w[13] = w[13] | 0x8000; break; - case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16 | append2[0] << 16; - w2[3] = append2[0] >> 16; + case 54: + w[13] = w[13] | 0x800000; break; - case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8 | append2[0] << 24; - w2[3] = append2[0] >> 8; + case 55: + w[13] = w[13] | 0x80000000; break; - case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; - w2[3] = append2[0]; + case 56: + w[14] = 0x80; break; - case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24 | append2[0] << 8; - w3[0] = append2[0] >> 24; + case 57: + w[14] = w[14] | 0x8000; break; - case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16 | append2[0] << 16; - w3[0] = append2[0] >> 16; + case 58: + w[14] = w[14] | 0x800000; break; - case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8 | append2[0] << 24; - w3[0] = append2[0] >> 8; + case 59: + w[14] = w[14] | 0x80000000; break; - case 16: - w1[0] = append0[0]; - w1[1] = append0[1]; - w1[2] = append0[2]; - w1[3] = append0[3]; - w2[0] = append1[0]; - w2[1] = append1[1]; - w2[2] = append1[2]; - w2[3] = append1[3]; - w3[0] = append2[0]; + case 60: + w[15] = 0x80; break; - case 17: - w1[0] = w1[0] | append0[0] << 8; - w1[1] = append0[0] >> 24 | append0[1] << 8; - w1[2] = append0[1] >> 24 | append0[2] << 8; - w1[3] = append0[2] >> 24 | append0[3] << 8; - w2[0] = append0[3] >> 24 | append1[0] << 8; - w2[1] = append1[0] >> 24 | append1[1] << 8; - w2[2] = append1[1] >> 24 | append1[2] << 8; - w2[3] = append1[2] >> 24 | append1[3] << 8; - w3[0] = append1[3] >> 24 | append2[0] << 8; - w3[1] = append2[0] >> 24; + case 61: + w[15] = w[15] | 0x8000; + break; + + case 62: + w[15] = w[15] | 0x800000; break; - case 18: - w1[0] = w1[0] | append0[0] << 16; - w1[1] = append0[0] >> 16 | append0[1] << 16; - w1[2] = append0[1] >> 16 | append0[2] << 16; - w1[3] = append0[2] >> 16 | append0[3] << 16; - w2[0] = append0[3] >> 16 | append1[0] << 16; - w2[1] = append1[0] >> 16 | append1[1] << 16; - w2[2] = append1[1] >> 16 | append1[2] << 16; - w2[3] = append1[2] >> 16 | append1[3] << 16; - w3[0] = append1[3] >> 16 | append2[0] << 16; - w3[1] = append2[0] >> 16; + case 63: + w[15] = w[15] | 0x80000000; break; + } +} - case 19: - w1[0] = w1[0] | append0[0] << 24; - w1[1] = append0[0] >> 8 | append0[1] << 24; - w1[2] = append0[1] >> 8 | append0[2] << 24; - w1[3] = append0[2] >> 8 | append0[3] << 24; - w2[0] = append0[3] >> 8 | append1[0] << 24; - w2[1] = append1[0] >> 8 | append1[1] << 24; - w2[2] = append1[1] >> 8 | append1[2] << 24; - w2[3] = append1[2] >> 8 | append1[3] << 24; - w3[0] = append1[3] >> 8 | append2[0] << 24; - w3[1] = append2[0] >> 8; +// before: append_0x80_8 +static void append_0x80_1x32 (u32 w[32], const u32 offset) +{ + switch (offset) + { + case 0: + w[ 0] = 0x80; break; - case 20: - w1[1] = append0[0]; - w1[2] = append0[1]; - w1[3] = append0[2]; - w2[0] = append0[3]; - w2[1] = append1[0]; - w2[2] = append1[1]; - w2[3] = append1[2]; - w3[0] = append1[3]; - w3[1] = append2[0]; + case 1: + w[ 0] = w[ 0] | 0x8000; break; - case 21: - w1[1] = w1[1] | append0[0] << 8; - w1[2] = append0[0] >> 24 | append0[1] << 8; - w1[3] = append0[1] >> 24 | append0[2] << 8; - w2[0] = append0[2] >> 24 | append0[3] << 8; - w2[1] = append0[3] >> 24 | append1[0] << 8; - w2[2] = append1[0] >> 24 | append1[1] << 8; - w2[3] = append1[1] >> 24 | append1[2] << 8; - w3[0] = append1[2] >> 24 | append1[3] << 8; - w3[1] = append1[3] >> 24 | append2[0] << 8; + case 2: + w[ 0] = w[ 0] | 0x800000; break; - case 22: - w1[1] = w1[1] | append0[0] << 16; - w1[2] = append0[0] >> 16 | append0[1] << 16; - w1[3] = append0[1] >> 16 | append0[2] << 16; - w2[0] = append0[2] >> 16 | append0[3] << 16; - w2[1] = append0[3] >> 16 | append1[0] << 16; - w2[2] = append1[0] >> 16 | append1[1] << 16; - w2[3] = append1[1] >> 16 | append1[2] << 16; - w3[0] = append1[2] >> 16 | append1[3] << 16; - w3[1] = append1[3] >> 16 | append2[0] << 16; + case 3: + w[ 0] = w[ 0] | 0x80000000; break; - case 23: - w1[1] = w1[1] | append0[0] << 24; - w1[2] = append0[0] >> 8 | append0[1] << 24; - w1[3] = append0[1] >> 8 | append0[2] << 24; - w2[0] = append0[2] >> 8 | append0[3] << 24; - w2[1] = append0[3] >> 8 | append1[0] << 24; - w2[2] = append1[0] >> 8 | append1[1] << 24; - w2[3] = append1[1] >> 8 | append1[2] << 24; - w3[0] = append1[2] >> 8 | append1[3] << 24; - w3[1] = append1[3] >> 8 | append2[0] << 24; + case 4: + w[ 1] = 0x80; break; - case 24: - w1[2] = append0[0]; - w1[3] = append0[1]; - w2[0] = append0[2]; - w2[1] = append0[3]; - w2[2] = append1[0]; - w2[3] = append1[1]; - w3[0] = append1[2]; - w3[1] = append1[3]; + case 5: + w[ 1] = w[ 1] | 0x8000; break; - case 25: - w1[2] = w1[2] | append0[0] << 8; - w1[3] = append0[0] >> 24 | append0[1] << 8; - w2[0] = append0[1] >> 24 | append0[2] << 8; - w2[1] = append0[2] >> 24 | append0[3] << 8; - w2[2] = append0[3] >> 24 | append1[0] << 8; - w2[3] = append1[0] >> 24 | append1[1] << 8; - w3[0] = append1[1] >> 24 | append1[2] << 8; - w3[1] = append1[2] >> 24 | append1[3] << 8; + case 6: + w[ 1] = w[ 1] | 0x800000; break; - case 26: - w1[2] = w1[2] | append0[0] << 16; - w1[3] = append0[0] >> 16 | append0[1] << 16; - w2[0] = append0[1] >> 16 | append0[2] << 16; - w2[1] = append0[2] >> 16 | append0[3] << 16; - w2[2] = append0[3] >> 16 | append1[0] << 16; - w2[3] = append1[0] >> 16 | append1[1] << 16; - w3[0] = append1[1] >> 16 | append1[2] << 16; - w3[1] = append1[2] >> 16 | append1[3] << 16; + case 7: + w[ 1] = w[ 1] | 0x80000000; break; - case 27: - w1[2] = w1[2] | append0[0] << 24; - w1[3] = append0[0] >> 8 | append0[1] << 24; - w2[0] = append0[1] >> 8 | append0[2] << 24; - w2[1] = append0[2] >> 8 | append0[3] << 24; - w2[2] = append0[3] >> 8 | append1[0] << 24; - w2[3] = append1[0] >> 8 | append1[1] << 24; - w3[0] = append1[1] >> 8 | append1[2] << 24; - w3[1] = append1[2] >> 8 | append1[3] << 24; + case 8: + w[ 2] = 0x80; break; - case 28: - w1[3] = append0[0]; - w2[0] = append0[1]; - w2[1] = append0[2]; - w2[2] = append0[3]; - w2[3] = append1[0]; - w3[0] = append1[1]; - w3[1] = append1[2]; + case 9: + w[ 2] = w[ 2] | 0x8000; break; - case 29: - w1[3] = w1[3] | append0[0] << 8; - w2[0] = append0[0] >> 24 | append0[1] << 8; - w2[1] = append0[1] >> 24 | append0[2] << 8; - w2[2] = append0[2] >> 24 | append0[3] << 8; - w2[3] = append0[3] >> 24 | append1[0] << 8; - w3[0] = append1[0] >> 24 | append1[1] << 8; - w3[1] = append1[1] >> 24 | append1[2] << 8; + case 10: + w[ 2] = w[ 2] | 0x800000; break; - case 30: - w1[3] = w1[3] | append0[0] << 16; - w2[0] = append0[0] >> 16 | append0[1] << 16; - w2[1] = append0[1] >> 16 | append0[2] << 16; - w2[2] = append0[2] >> 16 | append0[3] << 16; - w2[3] = append0[3] >> 16 | append1[0] << 16; - w3[0] = append1[0] >> 16 | append1[1] << 16; - w3[1] = append1[1] >> 16 | append1[2] << 16; + case 11: + w[ 2] = w[ 2] | 0x80000000; break; - case 31: - w1[3] = w1[3] | append0[0] << 24; - w2[0] = append0[0] >> 8 | append0[1] << 24; - w2[1] = append0[1] >> 8 | append0[2] << 24; - w2[2] = append0[2] >> 8 | append0[3] << 24; - w2[3] = append0[3] >> 8 | append1[0] << 24; - w3[0] = append1[0] >> 8 | append1[1] << 24; - w3[1] = append1[1] >> 8 | append1[2] << 24; + case 12: + w[ 3] = 0x80; break; - case 32: - w2[0] = append0[0]; - w2[1] = append0[1]; - w2[2] = append0[2]; - w2[3] = append0[3]; - w3[0] = append1[0]; - w3[1] = append1[1]; + case 13: + w[ 3] = w[ 3] | 0x8000; break; - } -} -static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #ifdef IS_AMD - const int offset_mod_4 = offset & 3; + case 14: + w[ 3] = w[ 3] | 0x800000; + break; - const int offset_minus_4 = 4 - offset; + case 15: + w[ 3] = w[ 3] | 0x80000000; + break; - switch (offset / 4) - { - case 0: - w3[2] = amd_bytealign ( 0, w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 16: + w[ 4] = 0x80; + break; - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 17: + w[ 4] = w[ 4] | 0x8000; + break; + case 18: + w[ 4] = w[ 4] | 0x800000; break; - case 1: - w3[2] = amd_bytealign ( 0, w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[0] = 0; + case 19: + w[ 4] = w[ 4] | 0x80000000; + break; + + case 20: + w[ 5] = 0x80; + break; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 21: + w[ 5] = w[ 5] | 0x8000; + break; + case 22: + w[ 5] = w[ 5] | 0x800000; break; - case 2: - w3[2] = amd_bytealign ( 0, w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; + case 23: + w[ 5] = w[ 5] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 24: + w[ 6] = 0x80; + break; + case 25: + w[ 6] = w[ 6] | 0x8000; break; - case 3: - w3[2] = amd_bytealign ( 0, w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 26: + w[ 6] = w[ 6] | 0x800000; + break; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 27: + w[ 6] = w[ 6] | 0x80000000; + break; + case 28: + w[ 7] = 0x80; break; - case 4: - w3[2] = amd_bytealign ( 0, w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 29: + w[ 7] = w[ 7] | 0x8000; + break; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 30: + w[ 7] = w[ 7] | 0x800000; + break; + case 31: + w[ 7] = w[ 7] | 0x80000000; break; - case 5: - w3[2] = amd_bytealign ( 0, w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 32: + w[ 8] = 0x80; + break; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 33: + w[ 8] = w[ 8] | 0x8000; + break; + case 34: + w[ 8] = w[ 8] | 0x800000; break; - case 6: - w3[2] = amd_bytealign ( 0, w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 35: + w[ 8] = w[ 8] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 36: + w[ 9] = 0x80; + break; + case 37: + w[ 9] = w[ 9] | 0x8000; break; - case 7: - w3[2] = amd_bytealign ( 0, w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 38: + w[ 9] = w[ 9] | 0x800000; + break; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 39: + w[ 9] = w[ 9] | 0x80000000; + break; + + case 40: + w[10] = 0x80; + break; + + case 41: + w[10] = w[10] | 0x8000; + break; + case 42: + w[10] = w[10] | 0x800000; break; - case 8: - w3[2] = amd_bytealign ( 0, w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 43: + w[10] = w[10] | 0x80000000; + break; + case 44: + w[11] = 0x80; break; - case 9: - w3[2] = amd_bytealign ( 0, w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 45: + w[11] = w[11] | 0x8000; + break; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 46: + w[11] = w[11] | 0x800000; + break; + case 47: + w[11] = w[11] | 0x80000000; break; - case 10: - w3[2] = amd_bytealign ( 0, w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 48: + w[12] = 0x80; + break; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 49: + w[12] = w[12] | 0x8000; + break; + case 50: + w[12] = w[12] | 0x800000; break; - case 11: - w3[2] = amd_bytealign ( 0, w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 51: + w[12] = w[12] | 0x80000000; + break; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 52: + w[13] = 0x80; + break; + case 53: + w[13] = w[13] | 0x8000; break; - case 12: - w3[2] = amd_bytealign ( 0, w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 54: + w[13] = w[13] | 0x800000; + break; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } + case 55: + w[13] = w[13] | 0x80000000; + break; + case 56: + w[14] = 0x80; break; - case 13: - w3[2] = amd_bytealign ( 0, w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 57: + w[14] = w[14] | 0x8000; + break; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = 0; - } + case 58: + w[14] = w[14] | 0x800000; + break; + case 59: + w[14] = w[14] | 0x80000000; break; - } - #endif - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); + case 60: + w[15] = 0x80; + break; - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + case 61: + w[15] = w[15] | 0x8000; + break; - switch (offset / 4) - { - case 0: - w3[1] = __byte_perm (w3[0], w3[1], selector); - w3[0] = __byte_perm (w2[3], w3[0], selector); - w2[3] = __byte_perm (w2[2], w2[3], selector); - w2[2] = __byte_perm (w2[1], w2[2], selector); - w2[1] = __byte_perm (w2[0], w2[1], selector); - w2[0] = __byte_perm (w1[3], w2[0], selector); - w1[3] = __byte_perm (w1[2], w1[3], selector); - w1[2] = __byte_perm (w1[1], w1[2], selector); - w1[1] = __byte_perm (w1[0], w1[1], selector); - w1[0] = __byte_perm (w0[3], w1[0], selector); - w0[3] = __byte_perm (w0[2], w0[3], selector); - w0[2] = __byte_perm (w0[1], w0[2], selector); - w0[1] = __byte_perm (w0[0], w0[1], selector); - w0[0] = __byte_perm ( 0, w0[0], selector); + case 62: + w[15] = w[15] | 0x800000; + break; + case 63: + w[15] = w[15] | 0x80000000; break; - case 1: - w3[1] = __byte_perm (w2[3], w3[0], selector); - w3[0] = __byte_perm (w2[2], w2[3], selector); - w2[3] = __byte_perm (w2[1], w2[2], selector); - w2[2] = __byte_perm (w2[0], w2[1], selector); - w2[1] = __byte_perm (w1[3], w2[0], selector); - w2[0] = __byte_perm (w1[2], w1[3], selector); - w1[3] = __byte_perm (w1[1], w1[2], selector); - w1[2] = __byte_perm (w1[0], w1[1], selector); - w1[1] = __byte_perm (w0[3], w1[0], selector); - w1[0] = __byte_perm (w0[2], w0[3], selector); - w0[3] = __byte_perm (w0[1], w0[2], selector); - w0[2] = __byte_perm (w0[0], w0[1], selector); - w0[1] = __byte_perm ( 0, w0[0], selector); - w0[0] = 0; + case 64: + w[16] = 0x80; + break; + case 65: + w[16] = w[16] | 0x8000; break; - case 2: - w3[1] = __byte_perm (w2[2], w2[3], selector); - w3[0] = __byte_perm (w2[1], w2[2], selector); - w2[3] = __byte_perm (w2[0], w2[1], selector); - w2[2] = __byte_perm (w1[3], w2[0], selector); - w2[1] = __byte_perm (w1[2], w1[3], selector); - w2[0] = __byte_perm (w1[1], w1[2], selector); - w1[3] = __byte_perm (w1[0], w1[1], selector); - w1[2] = __byte_perm (w0[3], w1[0], selector); - w1[1] = __byte_perm (w0[2], w0[3], selector); - w1[0] = __byte_perm (w0[1], w0[2], selector); - w0[3] = __byte_perm (w0[0], w0[1], selector); - w0[2] = __byte_perm ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; + case 66: + w[16] = w[16] | 0x800000; + break; + case 67: + w[16] = w[16] | 0x80000000; break; - case 3: - w3[1] = __byte_perm (w2[1], w2[2], selector); - w3[0] = __byte_perm (w2[0], w2[1], selector); - w2[3] = __byte_perm (w1[3], w2[0], selector); - w2[2] = __byte_perm (w1[2], w1[3], selector); - w2[1] = __byte_perm (w1[1], w1[2], selector); - w2[0] = __byte_perm (w1[0], w1[1], selector); - w1[3] = __byte_perm (w0[3], w1[0], selector); - w1[2] = __byte_perm (w0[2], w0[3], selector); - w1[1] = __byte_perm (w0[1], w0[2], selector); - w1[0] = __byte_perm (w0[0], w0[1], selector); - w0[3] = __byte_perm ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 68: + w[17] = 0x80; + break; + case 69: + w[17] = w[17] | 0x8000; break; - case 4: - w3[1] = __byte_perm (w2[0], w2[1], selector); - w3[0] = __byte_perm (w1[3], w2[0], selector); - w2[3] = __byte_perm (w1[2], w1[3], selector); - w2[2] = __byte_perm (w1[1], w1[2], selector); - w2[1] = __byte_perm (w1[0], w1[1], selector); - w2[0] = __byte_perm (w0[3], w1[0], selector); - w1[3] = __byte_perm (w0[2], w0[3], selector); - w1[2] = __byte_perm (w0[1], w0[2], selector); - w1[1] = __byte_perm (w0[0], w0[1], selector); - w1[0] = __byte_perm ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 70: + w[17] = w[17] | 0x800000; + break; + case 71: + w[17] = w[17] | 0x80000000; break; - case 5: - w3[1] = __byte_perm (w1[3], w2[0], selector); - w3[0] = __byte_perm (w1[2], w1[3], selector); - w2[3] = __byte_perm (w1[1], w1[2], selector); - w2[2] = __byte_perm (w1[0], w1[1], selector); - w2[1] = __byte_perm (w0[3], w1[0], selector); - w2[0] = __byte_perm (w0[2], w0[3], selector); - w1[3] = __byte_perm (w0[1], w0[2], selector); - w1[2] = __byte_perm (w0[0], w0[1], selector); - w1[1] = __byte_perm ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 72: + w[18] = 0x80; + break; + case 73: + w[18] = w[18] | 0x8000; break; - case 6: - w3[1] = __byte_perm (w1[2], w1[3], selector); - w3[0] = __byte_perm (w1[1], w1[2], selector); - w2[3] = __byte_perm (w1[0], w1[1], selector); - w2[2] = __byte_perm (w0[3], w1[0], selector); - w2[1] = __byte_perm (w0[2], w0[3], selector); - w2[0] = __byte_perm (w0[1], w0[2], selector); - w1[3] = __byte_perm (w0[0], w0[1], selector); - w1[2] = __byte_perm ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 74: + w[18] = w[18] | 0x800000; + break; + case 75: + w[18] = w[18] | 0x80000000; break; - case 7: - w3[1] = __byte_perm (w1[1], w1[2], selector); - w3[0] = __byte_perm (w1[0], w1[1], selector); - w2[3] = __byte_perm (w0[3], w1[0], selector); - w2[2] = __byte_perm (w0[2], w0[3], selector); - w2[1] = __byte_perm (w0[1], w0[2], selector); - w2[0] = __byte_perm (w0[0], w0[1], selector); - w1[3] = __byte_perm ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 76: + w[19] = 0x80; + break; + case 77: + w[19] = w[19] | 0x8000; break; - case 8: - w3[1] = __byte_perm (w1[0], w1[1], selector); - w3[0] = __byte_perm (w0[3], w1[0], selector); - w2[3] = __byte_perm (w0[2], w0[3], selector); - w2[2] = __byte_perm (w0[1], w0[2], selector); - w2[1] = __byte_perm (w0[0], w0[1], selector); - w2[0] = __byte_perm ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 78: + w[19] = w[19] | 0x800000; + break; + case 79: + w[19] = w[19] | 0x80000000; break; - case 9: - w3[1] = __byte_perm (w0[3], w1[0], selector); - w3[0] = __byte_perm (w0[2], w0[3], selector); - w2[3] = __byte_perm (w0[1], w0[2], selector); - w2[2] = __byte_perm (w0[0], w0[1], selector); - w2[1] = __byte_perm ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 80: + w[20] = 0x80; + break; + case 81: + w[20] = w[20] | 0x8000; break; - case 10: - w3[1] = __byte_perm (w0[2], w0[3], selector); - w3[0] = __byte_perm (w0[1], w0[2], selector); - w2[3] = __byte_perm (w0[0], w0[1], selector); - w2[2] = __byte_perm ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 82: + w[20] = w[20] | 0x800000; + break; + case 83: + w[20] = w[20] | 0x80000000; break; - case 11: - w3[1] = __byte_perm (w0[1], w0[2], selector); - w3[0] = __byte_perm (w0[0], w0[1], selector); - w2[3] = __byte_perm ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 84: + w[21] = 0x80; + break; + case 85: + w[21] = w[21] | 0x8000; break; - case 12: - w3[1] = __byte_perm (w0[0], w0[1], selector); - w3[0] = __byte_perm ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 86: + w[21] = w[21] | 0x800000; + break; + case 87: + w[21] = w[21] | 0x80000000; break; - case 13: - w3[1] = __byte_perm ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 88: + w[22] = 0x80; + break; + + case 89: + w[22] = w[22] | 0x8000; + break; + case 90: + w[22] = w[22] | 0x800000; break; - } - #endif -} -static void switch_buffer_by_offset_be (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #ifdef IS_AMD - switch (offset / 4) - { - case 0: - w3[2] = amd_bytealign (w3[1], 0, offset); - w3[1] = amd_bytealign (w3[0], w3[1], offset); - w3[0] = amd_bytealign (w2[3], w3[0], offset); - w2[3] = amd_bytealign (w2[2], w2[3], offset); - w2[2] = amd_bytealign (w2[1], w2[2], offset); - w2[1] = amd_bytealign (w2[0], w2[1], offset); - w2[0] = amd_bytealign (w1[3], w2[0], offset); - w1[3] = amd_bytealign (w1[2], w1[3], offset); - w1[2] = amd_bytealign (w1[1], w1[2], offset); - w1[1] = amd_bytealign (w1[0], w1[1], offset); - w1[0] = amd_bytealign (w0[3], w1[0], offset); - w0[3] = amd_bytealign (w0[2], w0[3], offset); - w0[2] = amd_bytealign (w0[1], w0[2], offset); - w0[1] = amd_bytealign (w0[0], w0[1], offset); - w0[0] = amd_bytealign ( 0, w0[0], offset); + case 91: + w[22] = w[22] | 0x80000000; + break; + + case 92: + w[23] = 0x80; break; - case 1: - w3[2] = amd_bytealign (w3[0], 0, offset); - w3[1] = amd_bytealign (w2[3], w3[0], offset); - w3[0] = amd_bytealign (w2[2], w2[3], offset); - w2[3] = amd_bytealign (w2[1], w2[2], offset); - w2[2] = amd_bytealign (w2[0], w2[1], offset); - w2[1] = amd_bytealign (w1[3], w2[0], offset); - w2[0] = amd_bytealign (w1[2], w1[3], offset); - w1[3] = amd_bytealign (w1[1], w1[2], offset); - w1[2] = amd_bytealign (w1[0], w1[1], offset); - w1[1] = amd_bytealign (w0[3], w1[0], offset); - w1[0] = amd_bytealign (w0[2], w0[3], offset); - w0[3] = amd_bytealign (w0[1], w0[2], offset); - w0[2] = amd_bytealign (w0[0], w0[1], offset); - w0[1] = amd_bytealign ( 0, w0[0], offset); - w0[0] = 0; + case 93: + w[23] = w[23] | 0x8000; break; - case 2: - w3[2] = amd_bytealign (w2[3], 0, offset); - w3[1] = amd_bytealign (w2[2], w2[3], offset); - w3[0] = amd_bytealign (w2[1], w2[2], offset); - w2[3] = amd_bytealign (w2[0], w2[1], offset); - w2[2] = amd_bytealign (w1[3], w2[0], offset); - w2[1] = amd_bytealign (w1[2], w1[3], offset); - w2[0] = amd_bytealign (w1[1], w1[2], offset); - w1[3] = amd_bytealign (w1[0], w1[1], offset); - w1[2] = amd_bytealign (w0[3], w1[0], offset); - w1[1] = amd_bytealign (w0[2], w0[3], offset); - w1[0] = amd_bytealign (w0[1], w0[2], offset); - w0[3] = amd_bytealign (w0[0], w0[1], offset); - w0[2] = amd_bytealign ( 0, w0[0], offset); - w0[1] = 0; - w0[0] = 0; + case 94: + w[23] = w[23] | 0x800000; break; - case 3: - w3[2] = amd_bytealign (w2[2], 0, offset); - w3[1] = amd_bytealign (w2[1], w2[2], offset); - w3[0] = amd_bytealign (w2[0], w2[1], offset); - w2[3] = amd_bytealign (w1[3], w2[0], offset); - w2[2] = amd_bytealign (w1[2], w1[3], offset); - w2[1] = amd_bytealign (w1[1], w1[2], offset); - w2[0] = amd_bytealign (w1[0], w1[1], offset); - w1[3] = amd_bytealign (w0[3], w1[0], offset); - w1[2] = amd_bytealign (w0[2], w0[3], offset); - w1[1] = amd_bytealign (w0[1], w0[2], offset); - w1[0] = amd_bytealign (w0[0], w0[1], offset); - w0[3] = amd_bytealign ( 0, w0[0], offset); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 95: + w[23] = w[23] | 0x80000000; break; - case 4: - w3[2] = amd_bytealign (w2[1], 0, offset); - w3[1] = amd_bytealign (w2[0], w2[1], offset); - w3[0] = amd_bytealign (w1[3], w2[0], offset); - w2[3] = amd_bytealign (w1[2], w1[3], offset); - w2[2] = amd_bytealign (w1[1], w1[2], offset); - w2[1] = amd_bytealign (w1[0], w1[1], offset); - w2[0] = amd_bytealign (w0[3], w1[0], offset); - w1[3] = amd_bytealign (w0[2], w0[3], offset); - w1[2] = amd_bytealign (w0[1], w0[2], offset); - w1[1] = amd_bytealign (w0[0], w0[1], offset); - w1[0] = amd_bytealign ( 0, w0[0], offset); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 96: + w[24] = 0x80; break; - case 5: - w3[2] = amd_bytealign (w2[0], 0, offset); - w3[1] = amd_bytealign (w1[3], w2[0], offset); - w3[0] = amd_bytealign (w1[2], w1[3], offset); - w2[3] = amd_bytealign (w1[1], w1[2], offset); - w2[2] = amd_bytealign (w1[0], w1[1], offset); - w2[1] = amd_bytealign (w0[3], w1[0], offset); - w2[0] = amd_bytealign (w0[2], w0[3], offset); - w1[3] = amd_bytealign (w0[1], w0[2], offset); - w1[2] = amd_bytealign (w0[0], w0[1], offset); - w1[1] = amd_bytealign ( 0, w0[0], offset); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 97: + w[24] = w[24] | 0x8000; break; - case 6: - w3[2] = amd_bytealign (w1[3], 0, offset); - w3[1] = amd_bytealign (w1[2], w1[3], offset); - w3[0] = amd_bytealign (w1[1], w1[2], offset); - w2[3] = amd_bytealign (w1[0], w1[1], offset); - w2[2] = amd_bytealign (w0[3], w1[0], offset); - w2[1] = amd_bytealign (w0[2], w0[3], offset); - w2[0] = amd_bytealign (w0[1], w0[2], offset); - w1[3] = amd_bytealign (w0[0], w0[1], offset); - w1[2] = amd_bytealign ( 0, w0[0], offset); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 98: + w[24] = w[24] | 0x800000; break; - case 7: - w3[2] = amd_bytealign (w1[2], 0, offset); - w3[1] = amd_bytealign (w1[1], w1[2], offset); - w3[0] = amd_bytealign (w1[0], w1[1], offset); - w2[3] = amd_bytealign (w0[3], w1[0], offset); - w2[2] = amd_bytealign (w0[2], w0[3], offset); - w2[1] = amd_bytealign (w0[1], w0[2], offset); - w2[0] = amd_bytealign (w0[0], w0[1], offset); - w1[3] = amd_bytealign ( 0, w0[0], offset); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 99: + w[24] = w[24] | 0x80000000; break; - case 8: - w3[2] = amd_bytealign (w1[1], 0, offset); - w3[1] = amd_bytealign (w1[0], w1[1], offset); - w3[0] = amd_bytealign (w0[3], w1[0], offset); - w2[3] = amd_bytealign (w0[2], w0[3], offset); - w2[2] = amd_bytealign (w0[1], w0[2], offset); - w2[1] = amd_bytealign (w0[0], w0[1], offset); - w2[0] = amd_bytealign ( 0, w0[0], offset); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 100: + w[25] = 0x80; break; - case 9: - w3[2] = amd_bytealign (w1[0], 0, offset); - w3[1] = amd_bytealign (w0[3], w1[0], offset); - w3[0] = amd_bytealign (w0[2], w0[3], offset); - w2[3] = amd_bytealign (w0[1], w0[2], offset); - w2[2] = amd_bytealign (w0[0], w0[1], offset); - w2[1] = amd_bytealign ( 0, w0[0], offset); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 101: + w[25] = w[25] | 0x8000; break; - case 10: - w3[2] = amd_bytealign (w0[3], 0, offset); - w3[1] = amd_bytealign (w0[2], w0[3], offset); - w3[0] = amd_bytealign (w0[1], w0[2], offset); - w2[3] = amd_bytealign (w0[0], w0[1], offset); - w2[2] = amd_bytealign ( 0, w0[0], offset); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 102: + w[25] = w[25] | 0x800000; break; - case 11: - w3[2] = amd_bytealign (w0[2], 0, offset); - w3[1] = amd_bytealign (w0[1], w0[2], offset); - w3[0] = amd_bytealign (w0[0], w0[1], offset); - w2[3] = amd_bytealign ( 0, w0[0], offset); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 103: + w[25] = w[25] | 0x80000000; break; - case 12: - w3[2] = amd_bytealign (w0[1], 0, offset); - w3[1] = amd_bytealign (w0[0], w0[1], offset); - w3[0] = amd_bytealign ( 0, w0[0], offset); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 104: + w[26] = 0x80; break; - case 13: - w3[2] = amd_bytealign (w0[0], 0, offset); - w3[1] = amd_bytealign ( 0, w0[0], offset); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 105: + w[26] = w[26] | 0x8000; + break; + + case 106: + w[26] = w[26] | 0x800000; + break; + + case 107: + w[26] = w[26] | 0x80000000; + break; + + case 108: + w[27] = 0x80; + break; + + case 109: + w[27] = w[27] | 0x8000; + break; + + case 110: + w[27] = w[27] | 0x800000; break; - } - #endif - #ifdef IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + case 111: + w[27] = w[27] | 0x80000000; + break; - switch (offset / 4) - { - case 0: - w3[1] = __byte_perm (w3[1], w3[0], selector); - w3[0] = __byte_perm (w3[0], w2[3], selector); - w2[3] = __byte_perm (w2[3], w2[2], selector); - w2[2] = __byte_perm (w2[2], w2[1], selector); - w2[1] = __byte_perm (w2[1], w2[0], selector); - w2[0] = __byte_perm (w2[0], w1[3], selector); - w1[3] = __byte_perm (w1[3], w1[2], selector); - w1[2] = __byte_perm (w1[2], w1[1], selector); - w1[1] = __byte_perm (w1[1], w1[0], selector); - w1[0] = __byte_perm (w1[0], w0[3], selector); - w0[3] = __byte_perm (w0[3], w0[2], selector); - w0[2] = __byte_perm (w0[2], w0[1], selector); - w0[1] = __byte_perm (w0[1], w0[0], selector); - w0[0] = __byte_perm (w0[0], 0, selector); + case 112: + w[28] = 0x80; break; - case 1: - w3[1] = __byte_perm (w3[0], w2[3], selector); - w3[0] = __byte_perm (w2[3], w2[2], selector); - w2[3] = __byte_perm (w2[2], w2[1], selector); - w2[2] = __byte_perm (w2[1], w2[0], selector); - w2[1] = __byte_perm (w2[0], w1[3], selector); - w2[0] = __byte_perm (w1[3], w1[2], selector); - w1[3] = __byte_perm (w1[2], w1[1], selector); - w1[2] = __byte_perm (w1[1], w1[0], selector); - w1[1] = __byte_perm (w1[0], w0[3], selector); - w1[0] = __byte_perm (w0[3], w0[2], selector); - w0[3] = __byte_perm (w0[2], w0[1], selector); - w0[2] = __byte_perm (w0[1], w0[0], selector); - w0[1] = __byte_perm (w0[0], 0, selector); - w0[0] = 0; + case 113: + w[28] = w[28] | 0x8000; break; - case 2: - w3[1] = __byte_perm (w2[3], w2[2], selector); - w3[0] = __byte_perm (w2[2], w2[1], selector); - w2[3] = __byte_perm (w2[1], w2[0], selector); - w2[2] = __byte_perm (w2[0], w1[3], selector); - w2[1] = __byte_perm (w1[3], w1[2], selector); - w2[0] = __byte_perm (w1[2], w1[1], selector); - w1[3] = __byte_perm (w1[1], w1[0], selector); - w1[2] = __byte_perm (w1[0], w0[3], selector); - w1[1] = __byte_perm (w0[3], w0[2], selector); - w1[0] = __byte_perm (w0[2], w0[1], selector); - w0[3] = __byte_perm (w0[1], w0[0], selector); - w0[2] = __byte_perm (w0[0], 0, selector); - w0[1] = 0; - w0[0] = 0; + case 114: + w[28] = w[28] | 0x800000; break; - case 3: - w3[1] = __byte_perm (w2[2], w2[1], selector); - w3[0] = __byte_perm (w2[1], w2[0], selector); - w2[3] = __byte_perm (w2[0], w1[3], selector); - w2[2] = __byte_perm (w1[3], w1[2], selector); - w2[1] = __byte_perm (w1[2], w1[1], selector); - w2[0] = __byte_perm (w1[1], w1[0], selector); - w1[3] = __byte_perm (w1[0], w0[3], selector); - w1[2] = __byte_perm (w0[3], w0[2], selector); - w1[1] = __byte_perm (w0[2], w0[1], selector); - w1[0] = __byte_perm (w0[1], w0[0], selector); - w0[3] = __byte_perm (w0[0], 0, selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 115: + w[28] = w[28] | 0x80000000; break; - case 4: - w3[1] = __byte_perm (w2[1], w2[0], selector); - w3[0] = __byte_perm (w2[0], w1[3], selector); - w2[3] = __byte_perm (w1[3], w1[2], selector); - w2[2] = __byte_perm (w1[2], w1[1], selector); - w2[1] = __byte_perm (w1[1], w1[0], selector); - w2[0] = __byte_perm (w1[0], w0[3], selector); - w1[3] = __byte_perm (w0[3], w0[2], selector); - w1[2] = __byte_perm (w0[2], w0[1], selector); - w1[1] = __byte_perm (w0[1], w0[0], selector); - w1[0] = __byte_perm (w0[0], 0, selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 116: + w[29] = 0x80; break; - case 5: - w3[1] = __byte_perm (w2[0], w1[3], selector); - w3[0] = __byte_perm (w1[3], w1[2], selector); - w2[3] = __byte_perm (w1[2], w1[1], selector); - w2[2] = __byte_perm (w1[1], w1[0], selector); - w2[1] = __byte_perm (w1[0], w0[3], selector); - w2[0] = __byte_perm (w0[3], w0[2], selector); - w1[3] = __byte_perm (w0[2], w0[1], selector); - w1[2] = __byte_perm (w0[1], w0[0], selector); - w1[1] = __byte_perm (w0[0], 0, selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 117: + w[29] = w[29] | 0x8000; break; - case 6: - w3[1] = __byte_perm (w1[3], w1[2], selector); - w3[0] = __byte_perm (w1[2], w1[1], selector); - w2[3] = __byte_perm (w1[1], w1[0], selector); - w2[2] = __byte_perm (w1[0], w0[3], selector); - w2[1] = __byte_perm (w0[3], w0[2], selector); - w2[0] = __byte_perm (w0[2], w0[1], selector); - w1[3] = __byte_perm (w0[1], w0[0], selector); - w1[2] = __byte_perm (w0[0], 0, selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 118: + w[29] = w[29] | 0x800000; break; - case 7: - w3[1] = __byte_perm (w1[2], w1[1], selector); - w3[0] = __byte_perm (w1[1], w1[0], selector); - w2[3] = __byte_perm (w1[0], w0[3], selector); - w2[2] = __byte_perm (w0[3], w0[2], selector); - w2[1] = __byte_perm (w0[2], w0[1], selector); - w2[0] = __byte_perm (w0[1], w0[0], selector); - w1[3] = __byte_perm (w0[0], 0, selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 119: + w[29] = w[29] | 0x80000000; break; - case 8: - w3[1] = __byte_perm (w1[1], w1[0], selector); - w3[0] = __byte_perm (w1[0], w0[3], selector); - w2[3] = __byte_perm (w0[3], w0[2], selector); - w2[2] = __byte_perm (w0[2], w0[1], selector); - w2[1] = __byte_perm (w0[1], w0[0], selector); - w2[0] = __byte_perm (w0[0], 0, selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 120: + w[30] = 0x80; break; - case 9: - w3[1] = __byte_perm (w1[0], w0[3], selector); - w3[0] = __byte_perm (w0[3], w0[2], selector); - w2[3] = __byte_perm (w0[2], w0[1], selector); - w2[2] = __byte_perm (w0[1], w0[0], selector); - w2[1] = __byte_perm (w0[0], 0, selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 121: + w[30] = w[30] | 0x8000; break; - case 10: - w3[1] = __byte_perm (w0[3], w0[2], selector); - w3[0] = __byte_perm (w0[2], w0[1], selector); - w2[3] = __byte_perm (w0[1], w0[0], selector); - w2[2] = __byte_perm (w0[0], 0, selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 122: + w[30] = w[30] | 0x800000; break; - case 11: - w3[1] = __byte_perm (w0[2], w0[1], selector); - w3[0] = __byte_perm (w0[1], w0[0], selector); - w2[3] = __byte_perm (w0[0], 0, selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 123: + w[30] = w[30] | 0x80000000; break; - case 12: - w3[1] = __byte_perm (w0[1], w0[0], selector); - w3[0] = __byte_perm (w0[0], 0, selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 124: + w[31] = 0x80; break; - case 13: - w3[1] = __byte_perm (w0[0], 0, selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 125: + w[31] = w[31] | 0x8000; + break; + + case 126: + w[31] = w[31] | 0x800000; + break; + + case 127: + w[31] = w[31] | 0x80000000; break; } - #endif } +*/ diff --git a/OpenCL/m00010_a0.cl b/OpenCL/m00010_a0.cl index 6d8262f14..18c07869f 100644 --- a/OpenCL/m00010_a0.cl +++ b/OpenCL/m00010_a0.cl @@ -166,7 +166,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m04 (__glo w3[2] = pw_salt_len * 8; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * md5 @@ -421,7 +421,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s04 (__glo w3[2] = pw_salt_len * 8; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * md5 diff --git a/OpenCL/m00010_a1.cl b/OpenCL/m00010_a1.cl index 956dc5471..b422a71cf 100644 --- a/OpenCL/m00010_a1.cl +++ b/OpenCL/m00010_a1.cl @@ -198,7 +198,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m04 (__glo w3[2] = pw_salt_len * 8; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * md5 @@ -487,7 +487,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s04 (__glo w3[2] = pw_salt_len * 8; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * md5 diff --git a/OpenCL/m00020_a0.cl b/OpenCL/m00020_a0.cl index 6dfd6c90b..8abacc4f6 100644 --- a/OpenCL/m00020_a0.cl +++ b/OpenCL/m00020_a0.cl @@ -143,7 +143,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00020_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; @@ -377,7 +377,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00020_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; diff --git a/OpenCL/m00020_a1.cl b/OpenCL/m00020_a1.cl index 672737f96..2ad50ce3d 100644 --- a/OpenCL/m00020_a1.cl +++ b/OpenCL/m00020_a1.cl @@ -189,7 +189,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00020_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; @@ -473,7 +473,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00020_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m00030_a0.cl b/OpenCL/m00030_a0.cl index ff2e7ac3a..1c2413347 100644 --- a/OpenCL/m00030_a0.cl +++ b/OpenCL/m00030_a0.cl @@ -173,7 +173,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00030_m04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; @@ -433,7 +433,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00030_s04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; diff --git a/OpenCL/m00030_a1.cl b/OpenCL/m00030_a1.cl index d156010e5..9b76aa411 100644 --- a/OpenCL/m00030_a1.cl +++ b/OpenCL/m00030_a1.cl @@ -219,7 +219,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00030_m04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; @@ -527,7 +527,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00030_s04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m00040_a0.cl b/OpenCL/m00040_a0.cl index 7eb35bbd2..fcfe3b93c 100644 --- a/OpenCL/m00040_a0.cl +++ b/OpenCL/m00040_a0.cl @@ -135,7 +135,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00040_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; @@ -359,7 +359,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00040_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[2] = out_salt_len * 8; diff --git a/OpenCL/m00040_a1.cl b/OpenCL/m00040_a1.cl index 2b2e1fbae..2da6eb408 100644 --- a/OpenCL/m00040_a1.cl +++ b/OpenCL/m00040_a1.cl @@ -183,7 +183,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00040_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; @@ -455,7 +455,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00040_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m00060_a0.cl b/OpenCL/m00060_a0.cl index 0bfd81da7..56ead01df 100644 --- a/OpenCL/m00060_a0.cl +++ b/OpenCL/m00060_a0.cl @@ -330,7 +330,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00060_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -505,7 +505,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00060_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = w0[0]; w0_t[1] = w0[1]; diff --git a/OpenCL/m00060_a1.cl b/OpenCL/m00060_a1.cl index 4fb02e91f..be66c8780 100644 --- a/OpenCL/m00060_a1.cl +++ b/OpenCL/m00060_a1.cl @@ -382,7 +382,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00060_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -611,7 +611,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00060_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = w0[0]; w0_t[1] = w0[1]; diff --git a/OpenCL/m00060_a3.cl b/OpenCL/m00060_a3.cl index 1f9b124f3..20303206d 100644 --- a/OpenCL/m00060_a3.cl +++ b/OpenCL/m00060_a3.cl @@ -281,7 +281,7 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w0[0] = w0l | w0r; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = w0[0]; w0_t[1] = w0[1]; @@ -401,7 +401,7 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w0[0] = w0l | w0r; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = w0[0]; w0_t[1] = w0[1]; diff --git a/OpenCL/m00100_a0.cl b/OpenCL/m00100_a0.cl index 568564fb3..d280fc7d2 100644 --- a/OpenCL/m00100_a0.cl +++ b/OpenCL/m00100_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -319,7 +319,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m00100_a1.cl b/OpenCL/m00100_a1.cl index 160e41b0c..383d7e933 100644 --- a/OpenCL/m00100_a1.cl +++ b/OpenCL/m00100_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -113,7 +113,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -335,7 +335,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -398,7 +398,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00100_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m00110_a0.cl b/OpenCL/m00110_a0.cl index 1263ab2a5..5e88b61ee 100644 --- a/OpenCL/m00110_a0.cl +++ b/OpenCL/m00110_a0.cl @@ -166,7 +166,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_m04 (__glo w3[2] |= s3[2]; w3[3] |= s3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha1 @@ -471,7 +471,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_s04 (__glo w3[2] |= s3[2]; w3[3] |= s3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha1 diff --git a/OpenCL/m00110_a1.cl b/OpenCL/m00110_a1.cl index 5bc9a16fc..b28b13bd4 100644 --- a/OpenCL/m00110_a1.cl +++ b/OpenCL/m00110_a1.cl @@ -198,7 +198,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha1 @@ -537,7 +537,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00110_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha1 diff --git a/OpenCL/m00120_a0.cl b/OpenCL/m00120_a0.cl index fe682eb64..2372f6f1e 100644 --- a/OpenCL/m00120_a0.cl +++ b/OpenCL/m00120_a0.cl @@ -143,7 +143,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[3] = out_salt_len * 8; @@ -427,7 +427,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m00120_a1.cl b/OpenCL/m00120_a1.cl index c73c2e076..b7172e035 100644 --- a/OpenCL/m00120_a1.cl +++ b/OpenCL/m00120_a1.cl @@ -189,7 +189,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[3] = pw_salt_len * 8; @@ -521,7 +521,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00120_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[3] = pw_salt_len * 8; diff --git a/OpenCL/m00130_a0.cl b/OpenCL/m00130_a0.cl index 491f45127..ade4da723 100644 --- a/OpenCL/m00130_a0.cl +++ b/OpenCL/m00130_a0.cl @@ -173,7 +173,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_m04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[3] = out_salt_len * 8; @@ -485,7 +485,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_s04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m00130_a1.cl b/OpenCL/m00130_a1.cl index 6b556a833..12a642dac 100644 --- a/OpenCL/m00130_a1.cl +++ b/OpenCL/m00130_a1.cl @@ -219,7 +219,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_m04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[3] = pw_salt_len * 8; @@ -579,7 +579,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00130_s04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[3] = pw_salt_len * 8; diff --git a/OpenCL/m00140_a0.cl b/OpenCL/m00140_a0.cl index 466b7cd74..90b368d20 100644 --- a/OpenCL/m00140_a0.cl +++ b/OpenCL/m00140_a0.cl @@ -137,7 +137,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[3] = out_salt_len * 8; @@ -413,7 +413,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); w3_t[3] = out_salt_len * 8; diff --git a/OpenCL/m00140_a1.cl b/OpenCL/m00140_a1.cl index 6512ef833..3b297d551 100644 --- a/OpenCL/m00140_a1.cl +++ b/OpenCL/m00140_a1.cl @@ -183,7 +183,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[3] = pw_salt_len * 8; @@ -507,7 +507,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00140_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[3] = pw_salt_len * 8; diff --git a/OpenCL/m00160_a0.cl b/OpenCL/m00160_a0.cl index 0707e96ec..53db683ea 100644 --- a/OpenCL/m00160_a0.cl +++ b/OpenCL/m00160_a0.cl @@ -362,7 +362,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); @@ -537,7 +537,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m00160_a1.cl b/OpenCL/m00160_a1.cl index f6cee14f3..8b3cddabb 100644 --- a/OpenCL/m00160_a1.cl +++ b/OpenCL/m00160_a1.cl @@ -414,7 +414,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); @@ -643,7 +643,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00160_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m00190_a0.cl b/OpenCL/m00190_a0.cl index 895c9ded3..bdfe91d0a 100644 --- a/OpenCL/m00190_a0.cl +++ b/OpenCL/m00190_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -330,7 +330,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m00190_a1.cl b/OpenCL/m00190_a1.cl index e55709dfd..091805445 100644 --- a/OpenCL/m00190_a1.cl +++ b/OpenCL/m00190_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -113,7 +113,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -352,7 +352,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -409,7 +409,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00190_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m00300_a0.cl b/OpenCL/m00300_a0.cl index 8e2db4aaa..cdc7017be 100644 --- a/OpenCL/m00300_a0.cl +++ b/OpenCL/m00300_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -444,7 +444,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m00300_a1.cl b/OpenCL/m00300_a1.cl index 57938be17..21635be49 100644 --- a/OpenCL/m00300_a1.cl +++ b/OpenCL/m00300_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -113,7 +113,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -460,7 +460,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -523,7 +523,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00300_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m00900_a0.cl b/OpenCL/m00900_a0.cl index e6b6ad0fb..19713d327 100644 --- a/OpenCL/m00900_a0.cl +++ b/OpenCL/m00900_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00900_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -247,7 +247,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00900_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m00900_a1.cl b/OpenCL/m00900_a1.cl index c4d3a3eab..f30c03aa7 100644 --- a/OpenCL/m00900_a1.cl +++ b/OpenCL/m00900_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00900_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -265,7 +265,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00900_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m01000_a0.cl b/OpenCL/m01000_a0.cl index b5ca3b4e0..9309aa98e 100644 --- a/OpenCL/m01000_a0.cl +++ b/OpenCL/m01000_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -257,7 +257,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m01000_a1.cl b/OpenCL/m01000_a1.cl index 2079407f8..e4f8e2286 100644 --- a/OpenCL/m01000_a1.cl +++ b/OpenCL/m01000_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -277,7 +277,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m01100_a0.cl b/OpenCL/m01100_a0.cl index 4fb454f54..178f77766 100644 --- a/OpenCL/m01100_a0.cl +++ b/OpenCL/m01100_a0.cl @@ -117,7 +117,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01100_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -387,7 +387,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01100_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m01100_a1.cl b/OpenCL/m01100_a1.cl index 3cb6f66d7..7bad83ee6 100644 --- a/OpenCL/m01100_a1.cl +++ b/OpenCL/m01100_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01100_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -380,7 +380,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01100_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m01400_a0.cl b/OpenCL/m01400_a0.cl index 6f71e3706..fd768e5a8 100644 --- a/OpenCL/m01400_a0.cl +++ b/OpenCL/m01400_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * SHA256 @@ -288,7 +288,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * SHA256 diff --git a/OpenCL/m01400_a1.cl b/OpenCL/m01400_a1.cl index 54893a96d..2facff5ca 100644 --- a/OpenCL/m01400_a1.cl +++ b/OpenCL/m01400_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -113,7 +113,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -304,7 +304,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -361,7 +361,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m01410_a0.cl b/OpenCL/m01410_a0.cl index 242071dad..9bd568880 100644 --- a/OpenCL/m01410_a0.cl +++ b/OpenCL/m01410_a0.cl @@ -166,7 +166,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_m04 (__glo w3[2] |= s3[2]; w3[3] |= s3[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha256 @@ -440,7 +440,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_s04 (__glo w3[2] |= s3[2]; w3[3] |= s3[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha256 diff --git a/OpenCL/m01410_a1.cl b/OpenCL/m01410_a1.cl index c421dd6ba..3a4861ca6 100644 --- a/OpenCL/m01410_a1.cl +++ b/OpenCL/m01410_a1.cl @@ -192,7 +192,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_m04 (__glo w3[2] = wordl3[2] | wordr3[2] | s3[2]; w3[3] = wordl3[3] | wordr3[3] | s3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha256 @@ -494,7 +494,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01410_s04 (__glo w3[2] = wordl3[2] | wordr3[2] | s3[2]; w3[3] = wordl3[3] | wordr3[3] | s3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha256 diff --git a/OpenCL/m01420_a0.cl b/OpenCL/m01420_a0.cl index 71e7f4fbd..b513b1b78 100644 --- a/OpenCL/m01420_a0.cl +++ b/OpenCL/m01420_a0.cl @@ -127,7 +127,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_m04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha256 @@ -362,7 +362,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_s04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha256 diff --git a/OpenCL/m01420_a1.cl b/OpenCL/m01420_a1.cl index f2d365841..a05f22c1c 100644 --- a/OpenCL/m01420_a1.cl +++ b/OpenCL/m01420_a1.cl @@ -167,7 +167,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_m04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha256 @@ -444,7 +444,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01420_s04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha256 diff --git a/OpenCL/m01430_a0.cl b/OpenCL/m01430_a0.cl index 483899d9f..32912d7cb 100644 --- a/OpenCL/m01430_a0.cl +++ b/OpenCL/m01430_a0.cl @@ -171,7 +171,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_m04 (__glo w3_t2[2] |= s3[2]; w3_t2[3] |= s3[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); /** * sha256 @@ -450,7 +450,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_s04 (__glo w3_t2[2] |= s3[2]; w3_t2[3] |= s3[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); /** * sha256 diff --git a/OpenCL/m01430_a1.cl b/OpenCL/m01430_a1.cl index 2b3265010..dee03725b 100644 --- a/OpenCL/m01430_a1.cl +++ b/OpenCL/m01430_a1.cl @@ -211,7 +211,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_m04 (__glo w3_t2[2] |= s3[2]; w3_t2[3] |= s3[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); /** * sha256 @@ -532,7 +532,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01430_s04 (__glo w3_t2[2] |= s3[2]; w3_t2[3] |= s3[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); /** * sha256 diff --git a/OpenCL/m01440_a0.cl b/OpenCL/m01440_a0.cl index d6d95c2cc..dd9dfa7df 100644 --- a/OpenCL/m01440_a0.cl +++ b/OpenCL/m01440_a0.cl @@ -129,7 +129,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_m04 (__glo w1_t2[2] |= salt_buf1[2]; w1_t2[3] |= salt_buf1[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); /** * sha256 @@ -366,7 +366,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_s04 (__glo w1_t2[2] |= salt_buf1[2]; w1_t2[3] |= salt_buf1[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, out_salt_len); /** * sha256 diff --git a/OpenCL/m01440_a1.cl b/OpenCL/m01440_a1.cl index ef5088e78..f4c93c882 100644 --- a/OpenCL/m01440_a1.cl +++ b/OpenCL/m01440_a1.cl @@ -175,7 +175,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_m04 (__glo w1_t2[2] |= salt_buf1[2]; w1_t2[3] |= salt_buf1[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); /** * sha256 @@ -460,7 +460,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01440_s04 (__glo w1_t2[2] |= salt_buf1[2]; w1_t2[3] |= salt_buf1[3]; - append_0x80_4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); + append_0x80_4x4 (w0_t2, w1_t2, w2_t2, w3_t2, pw_salt_len); /** * sha256 diff --git a/OpenCL/m01460_a0.cl b/OpenCL/m01460_a0.cl index 99e952e70..9e0d4f567 100644 --- a/OpenCL/m01460_a0.cl +++ b/OpenCL/m01460_a0.cl @@ -352,7 +352,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); @@ -527,7 +527,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m01460_a1.cl b/OpenCL/m01460_a1.cl index 0d902e2a0..894194122 100644 --- a/OpenCL/m01460_a1.cl +++ b/OpenCL/m01460_a1.cl @@ -404,7 +404,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); @@ -633,7 +633,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01460_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m01600.cl b/OpenCL/m01600.cl index fef28b4f9..4b459fb93 100644 --- a/OpenCL/m01600.cl +++ b/OpenCL/m01600.cl @@ -1750,7 +1750,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01600_init (__gl block_len += pw_len; - append_0x80_4 (block0, block1, block2, block3, block_len); + append_0x80_4x4 (block0, block1, block2, block3, block_len); block3[2] = block_len * 8; @@ -1821,7 +1821,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01600_init (__gl block_len++; } - append_0x80_4 (block0, block1, block2, block3, block_len); + append_0x80_4x4 (block0, block1, block2, block3, block_len); block3[2] = block_len * 8; @@ -1864,7 +1864,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01600_loop (__gl w0_x80[2] = w0[2]; w0_x80[3] = w0[3]; - append_0x80_1 (w0_x80, pw_len); + append_0x80_1x4 (w0_x80, pw_len); /** * salt diff --git a/OpenCL/m01700_a0.cl b/OpenCL/m01700_a0.cl index 2dcd4f4aa..3b26ea952 100644 --- a/OpenCL/m01700_a0.cl +++ b/OpenCL/m01700_a0.cl @@ -211,7 +211,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * SHA512 @@ -350,7 +350,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * SHA512 diff --git a/OpenCL/m01700_a1.cl b/OpenCL/m01700_a1.cl index f0c76115c..46e747081 100644 --- a/OpenCL/m01700_a1.cl +++ b/OpenCL/m01700_a1.cl @@ -189,7 +189,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -234,7 +234,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -366,7 +366,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -423,7 +423,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01700_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m01710_a0.cl b/OpenCL/m01710_a0.cl index 8d85e4c64..6e56b6893 100644 --- a/OpenCL/m01710_a0.cl +++ b/OpenCL/m01710_a0.cl @@ -287,7 +287,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_m04 (__glo w3[2] |= s3[2]; w3[3] |= s3[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha512 @@ -502,7 +502,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_s04 (__glo w3[2] |= s3[2]; w3[3] |= s3[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha512 diff --git a/OpenCL/m01710_a1.cl b/OpenCL/m01710_a1.cl index 56c2ba0ba..d3c96fb97 100644 --- a/OpenCL/m01710_a1.cl +++ b/OpenCL/m01710_a1.cl @@ -313,7 +313,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_m04 (__glo w3[2] = wordl3[2] | wordr3[2] | s3[2]; w3[3] = wordl3[3] | wordr3[3] | s3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha512 @@ -556,7 +556,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01710_s04 (__glo w3[2] = wordl3[2] | wordr3[2] | s3[2]; w3[3] = wordl3[3] | wordr3[3] | s3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha512 diff --git a/OpenCL/m01720_a0.cl b/OpenCL/m01720_a0.cl index ca77914cf..b97faeaf2 100644 --- a/OpenCL/m01720_a0.cl +++ b/OpenCL/m01720_a0.cl @@ -248,7 +248,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_m04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha512 @@ -424,7 +424,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_s04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, out_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, out_salt_len); /** * sha512 diff --git a/OpenCL/m01720_a1.cl b/OpenCL/m01720_a1.cl index f87bd7401..c197b5fb9 100644 --- a/OpenCL/m01720_a1.cl +++ b/OpenCL/m01720_a1.cl @@ -288,7 +288,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_m04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha512 @@ -506,7 +506,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01720_s04 (__glo w1[2] |= salt_buf1[2]; w1[3] |= salt_buf1[3]; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); /** * sha512 diff --git a/OpenCL/m01730_a0.cl b/OpenCL/m01730_a0.cl index ae3f0e9fc..7e9557299 100644 --- a/OpenCL/m01730_a0.cl +++ b/OpenCL/m01730_a0.cl @@ -292,7 +292,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_m04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); /** * sha512 @@ -507,7 +507,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_s04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); /** * sha512 diff --git a/OpenCL/m01730_a1.cl b/OpenCL/m01730_a1.cl index 0925b6b2b..deb875cd8 100644 --- a/OpenCL/m01730_a1.cl +++ b/OpenCL/m01730_a1.cl @@ -332,7 +332,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_m04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); /** * sha512 @@ -589,7 +589,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01730_s04 (__glo w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); /** * sha512 diff --git a/OpenCL/m01740_a0.cl b/OpenCL/m01740_a0.cl index b8cf1d474..d272ab9b6 100644 --- a/OpenCL/m01740_a0.cl +++ b/OpenCL/m01740_a0.cl @@ -250,7 +250,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); /** * sha512 @@ -423,7 +423,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); /** * sha512 diff --git a/OpenCL/m01740_a1.cl b/OpenCL/m01740_a1.cl index 2e38d54c5..e2d25fd57 100644 --- a/OpenCL/m01740_a1.cl +++ b/OpenCL/m01740_a1.cl @@ -296,7 +296,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_m04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); /** * sha512 @@ -517,7 +517,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01740_s04 (__glo w1_t[2] |= salt_buf1[2]; w1_t[3] |= salt_buf1[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); /** * sha512 diff --git a/OpenCL/m01760_a0.cl b/OpenCL/m01760_a0.cl index e37a0afe8..cd8c7776f 100644 --- a/OpenCL/m01760_a0.cl +++ b/OpenCL/m01760_a0.cl @@ -383,7 +383,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); @@ -559,7 +559,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m01760_a1.cl b/OpenCL/m01760_a1.cl index ab136a0bc..bd9ee2402 100644 --- a/OpenCL/m01760_a1.cl +++ b/OpenCL/m01760_a1.cl @@ -435,7 +435,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); @@ -665,7 +665,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01760_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0_t[0] = swap_workaround (w0[0]); w0_t[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m02100.cl b/OpenCL/m02100.cl index 9e6df954d..061c2496f 100644 --- a/OpenCL/m02100.cl +++ b/OpenCL/m02100.cl @@ -373,7 +373,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02100_init (__gl * generate dcc */ - append_0x80_1 (w0, pw_len); + append_0x80_1x4 (w0, pw_len); make_unicode (w0, w0, w1); @@ -405,7 +405,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02100_init (__gl w3[2] = (16 + salt_len) * 8; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, 16 + salt_len); + append_0x80_4x4 (w0, w1, w2, w3, 16 + salt_len); digest_md4[0] = MD4M_A; digest_md4[1] = MD4M_B; @@ -473,8 +473,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02100_init (__gl w3[2] = 0; w3[3] = (64 + salt_len + 4) * 8; - append_0x01_4 (w0, w1, w2, w3, salt_len + 3); - append_0x80_4 (w0, w1, w2, w3, salt_len + 4); + append_0x01_4x4 (w0, w1, w2, w3, salt_len + 3); + append_0x80_4x4 (w0, w1, w2, w3, salt_len + 4); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m02500.cl b/OpenCL/m02500.cl index d5c86dff0..ed996ab24 100644 --- a/OpenCL/m02500.cl +++ b/OpenCL/m02500.cl @@ -545,11 +545,11 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02500_init (__gl w3[3] = 0; if (j == 1) - append_0x01_3 (w0, w1, w2, salt_len + 3); + append_0x01_3x4 (w0, w1, w2, salt_len + 3); else - append_0x02_3 (w0, w1, w2, salt_len + 3); + append_0x02_3x4 (w0, w1, w2, salt_len + 3); - append_0x80_3 (w0, w1, w2, salt_len + 4); + append_0x80_3x4 (w0, w1, w2, salt_len + 4); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m02610_a0.cl b/OpenCL/m02610_a0.cl index d1d6da418..3381260f9 100644 --- a/OpenCL/m02610_a0.cl +++ b/OpenCL/m02610_a0.cl @@ -152,7 +152,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02610_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -480,7 +480,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02610_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m02610_a1.cl b/OpenCL/m02610_a1.cl index 9f334a148..cad9768db 100644 --- a/OpenCL/m02610_a1.cl +++ b/OpenCL/m02610_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02610_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -446,7 +446,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02610_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m02710_a0.cl b/OpenCL/m02710_a0.cl index c603a41ef..5d1a73195 100644 --- a/OpenCL/m02710_a0.cl +++ b/OpenCL/m02710_a0.cl @@ -154,7 +154,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02710_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -567,7 +567,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02710_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m02710_a1.cl b/OpenCL/m02710_a1.cl index 02f7e6096..8a5b0afba 100644 --- a/OpenCL/m02710_a1.cl +++ b/OpenCL/m02710_a1.cl @@ -82,7 +82,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02710_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -535,7 +535,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02710_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m02810_a0.cl b/OpenCL/m02810_a0.cl index ab9f6436a..b13f1be16 100644 --- a/OpenCL/m02810_a0.cl +++ b/OpenCL/m02810_a0.cl @@ -157,7 +157,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02810_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -569,7 +569,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02810_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m02810_a1.cl b/OpenCL/m02810_a1.cl index 6471732c8..c396fca69 100644 --- a/OpenCL/m02810_a1.cl +++ b/OpenCL/m02810_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02810_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -530,7 +530,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m02810_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m03710_a0.cl b/OpenCL/m03710_a0.cl index 9a9900164..61615f4d3 100644 --- a/OpenCL/m03710_a0.cl +++ b/OpenCL/m03710_a0.cl @@ -171,7 +171,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03710_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -553,7 +553,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03710_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m03710_a1.cl b/OpenCL/m03710_a1.cl index c475b8c70..e3b150818 100644 --- a/OpenCL/m03710_a1.cl +++ b/OpenCL/m03710_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03710_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -500,7 +500,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03710_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m03800_a0.cl b/OpenCL/m03800_a0.cl index 0eac70aae..c58da6510 100644 --- a/OpenCL/m03800_a0.cl +++ b/OpenCL/m03800_a0.cl @@ -262,7 +262,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03800_m04 (__glo const u32 pw_salt_len = salt_len + out_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; @@ -613,7 +613,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03800_s04 (__glo const u32 pw_salt_len = salt_len + out_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m03800_a1.cl b/OpenCL/m03800_a1.cl index 9731b7bf4..7b829abf3 100644 --- a/OpenCL/m03800_a1.cl +++ b/OpenCL/m03800_a1.cl @@ -313,7 +313,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03800_m04 (__glo const u32 pw_salt_len = salt_len + pw_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; @@ -717,7 +717,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m03800_s04 (__glo const u32 pw_salt_len = salt_len + pw_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m03800_a3.cl b/OpenCL/m03800_a3.cl index 9451f63cc..61490e079 100644 --- a/OpenCL/m03800_a3.cl +++ b/OpenCL/m03800_a3.cl @@ -184,7 +184,7 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; @@ -447,7 +447,7 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= s3[2]; w3_t[3] |= s3[3]; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m04310_a0.cl b/OpenCL/m04310_a0.cl index 7ee75f1eb..84b76529e 100644 --- a/OpenCL/m04310_a0.cl +++ b/OpenCL/m04310_a0.cl @@ -152,7 +152,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04310_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -480,7 +480,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04310_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m04310_a1.cl b/OpenCL/m04310_a1.cl index 4467889b0..c6b298d45 100644 --- a/OpenCL/m04310_a1.cl +++ b/OpenCL/m04310_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04310_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -446,7 +446,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04310_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m04400_a0.cl b/OpenCL/m04400_a0.cl index 175ddc9f1..889389e47 100644 --- a/OpenCL/m04400_a0.cl +++ b/OpenCL/m04400_a0.cl @@ -135,7 +135,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -501,7 +501,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m04400_a1.cl b/OpenCL/m04400_a1.cl index 4e2fd3917..4d41d6785 100644 --- a/OpenCL/m04400_a1.cl +++ b/OpenCL/m04400_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -158,7 +158,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -488,7 +488,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -580,7 +580,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m04500_a0.cl b/OpenCL/m04500_a0.cl index f85c9dae9..5d00c94d8 100644 --- a/OpenCL/m04500_a0.cl +++ b/OpenCL/m04500_a0.cl @@ -135,7 +135,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -537,7 +537,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m04500_a1.cl b/OpenCL/m04500_a1.cl index 55f202af8..fd4660e03 100644 --- a/OpenCL/m04500_a1.cl +++ b/OpenCL/m04500_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -158,7 +158,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -517,7 +517,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -615,7 +615,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04500_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m04700_a0.cl b/OpenCL/m04700_a0.cl index 57daee065..bb04ca7b8 100644 --- a/OpenCL/m04700_a0.cl +++ b/OpenCL/m04700_a0.cl @@ -136,7 +136,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04700_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * md5 @@ -490,7 +490,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04700_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * md5 diff --git a/OpenCL/m04700_a1.cl b/OpenCL/m04700_a1.cl index b6996d50c..407e7ef6f 100644 --- a/OpenCL/m04700_a1.cl +++ b/OpenCL/m04700_a1.cl @@ -79,7 +79,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04700_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -159,7 +159,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04700_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -471,7 +471,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04700_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -569,7 +569,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04700_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m04900_a0.cl b/OpenCL/m04900_a0.cl index e73a36bf7..39562ccde 100644 --- a/OpenCL/m04900_a0.cl +++ b/OpenCL/m04900_a0.cl @@ -200,7 +200,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_m04 (__glo const u32 pw_salt_len = salt_len + out_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); u32 w0 = swap_workaround (w0_t[0]); u32 w1 = swap_workaround (w0_t[1]); @@ -538,7 +538,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_s04 (__glo const u32 pw_salt_len = salt_len + out_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); u32 w0 = swap_workaround (w0_t[0]); u32 w1 = swap_workaround (w0_t[1]); diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl index 32358da50..5dd74b73a 100644 --- a/OpenCL/m04900_a1.cl +++ b/OpenCL/m04900_a1.cl @@ -252,7 +252,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_m04 (__glo const u32 pw_salt_len = salt_len + pw_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); u32 w0 = swap_workaround (w0_t[0]); u32 w1 = swap_workaround (w0_t[1]); @@ -644,7 +644,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m04900_s04 (__glo const u32 pw_salt_len = salt_len + pw_len + salt_len; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); u32 w0 = swap_workaround (w0_t[0]); u32 w1 = swap_workaround (w0_t[1]); diff --git a/OpenCL/m04900_a3.cl b/OpenCL/m04900_a3.cl index c5dce610c..712cd2e5d 100644 --- a/OpenCL/m04900_a3.cl +++ b/OpenCL/m04900_a3.cl @@ -119,7 +119,7 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le salt_buf3[2] |= salt_buf3_t[2]; salt_buf3[3] |= salt_buf3_t[3]; - append_0x80_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); + append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); /** * loop @@ -432,7 +432,7 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le salt_buf3[2] |= salt_buf3_t[2]; salt_buf3[3] |= salt_buf3_t[3]; - append_0x80_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); + append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); /** * loop diff --git a/OpenCL/m05000_a0.cl b/OpenCL/m05000_a0.cl index 08371b44b..7351eccb8 100644 --- a/OpenCL/m05000_a0.cl +++ b/OpenCL/m05000_a0.cl @@ -162,7 +162,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05000_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x01_2 (w0, w1, out_len); + append_0x01_2x4 (w0, w1, out_len); u64 st[25]; @@ -373,7 +373,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05000_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x01_2 (w0, w1, out_len); + append_0x01_2x4 (w0, w1, out_len); u64 st[25]; diff --git a/OpenCL/m05000_a1.cl b/OpenCL/m05000_a1.cl index 1edd1b924..df47782bf 100644 --- a/OpenCL/m05000_a1.cl +++ b/OpenCL/m05000_a1.cl @@ -130,7 +130,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05000_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x01_2 (wordl0, wordl1, pw_l_len); + append_0x01_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -185,7 +185,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05000_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x01_2 (wordr0, wordr1, pw_r_len); + append_0x01_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -385,7 +385,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05000_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x01_2 (wordl0, wordl1, pw_l_len); + append_0x01_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -452,7 +452,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05000_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x01_2 (wordr0, wordr1, pw_r_len); + append_0x01_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m05100_a0.cl b/OpenCL/m05100_a0.cl index 9e34cd3ec..46838fc3b 100644 --- a/OpenCL/m05100_a0.cl +++ b/OpenCL/m05100_a0.cl @@ -90,7 +90,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05100_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; @@ -289,7 +289,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05100_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); w3[2] = out_len * 8; diff --git a/OpenCL/m05100_a1.cl b/OpenCL/m05100_a1.cl index 974138dbc..4155d60c8 100644 --- a/OpenCL/m05100_a1.cl +++ b/OpenCL/m05100_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05100_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -319,7 +319,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05100_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m05200.cl b/OpenCL/m05200.cl index b160a1cb8..ddc59f3e5 100644 --- a/OpenCL/m05200.cl +++ b/OpenCL/m05200.cl @@ -233,7 +233,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05200_init (__gl const u32 block_len = pw_len + salt_len; - append_0x80_4 (w0, w1, w2, w3, block_len); + append_0x80_4x4 (w0, w1, w2, w3, block_len); /** * init diff --git a/OpenCL/m05500_a0.cl b/OpenCL/m05500_a0.cl index 34a346bf2..572cbe3d1 100644 --- a/OpenCL/m05500_a0.cl +++ b/OpenCL/m05500_a0.cl @@ -645,7 +645,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05500_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -891,7 +891,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05500_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m05500_a1.cl b/OpenCL/m05500_a1.cl index 9f0ff9d2e..729e77819 100644 --- a/OpenCL/m05500_a1.cl +++ b/OpenCL/m05500_a1.cl @@ -581,7 +581,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05500_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -869,7 +869,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05500_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m05600_a0.cl b/OpenCL/m05600_a0.cl index d65e1a5e8..c6447de59 100644 --- a/OpenCL/m05600_a0.cl +++ b/OpenCL/m05600_a0.cl @@ -381,7 +381,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05600_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -662,7 +662,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05600_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m05600_a1.cl b/OpenCL/m05600_a1.cl index 89fc1b34d..92382ade3 100644 --- a/OpenCL/m05600_a1.cl +++ b/OpenCL/m05600_a1.cl @@ -330,7 +330,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05600_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -653,7 +653,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m05600_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m06000_a0.cl b/OpenCL/m06000_a0.cl index da1d4dcca..7b6014b87 100644 --- a/OpenCL/m06000_a0.cl +++ b/OpenCL/m06000_a0.cl @@ -287,7 +287,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06000_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 wl[16]; @@ -415,7 +415,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06000_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 wl[16]; diff --git a/OpenCL/m06000_a1.cl b/OpenCL/m06000_a1.cl index 94af3b2f1..95553c290 100644 --- a/OpenCL/m06000_a1.cl +++ b/OpenCL/m06000_a1.cl @@ -265,7 +265,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06000_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -435,7 +435,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06000_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m06100_a0.cl b/OpenCL/m06100_a0.cl index 0ac351b00..b2648715f 100644 --- a/OpenCL/m06100_a0.cl +++ b/OpenCL/m06100_a0.cl @@ -1432,7 +1432,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 wl[16]; @@ -1594,7 +1594,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 wl[16]; diff --git a/OpenCL/m06100_a1.cl b/OpenCL/m06100_a1.cl index f644a5be7..1aeab93cb 100644 --- a/OpenCL/m06100_a1.cl +++ b/OpenCL/m06100_a1.cl @@ -1385,7 +1385,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -1589,7 +1589,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06100_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m06300.cl b/OpenCL/m06300.cl index c4ddb50f8..593d31105 100644 --- a/OpenCL/m06300.cl +++ b/OpenCL/m06300.cl @@ -1649,7 +1649,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06300_init (__gl block_len += pw_len; - append_0x80_4 (block0, block1, block2, block3, block_len); + append_0x80_4x4 (block0, block1, block2, block3, block_len); block3[2] = block_len * 8; @@ -1715,7 +1715,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06300_init (__gl block_len++; } - append_0x80_4 (block0, block1, block2, block3, block_len); + append_0x80_4x4 (block0, block1, block2, block3, block_len); block3[2] = block_len * 8; @@ -1758,7 +1758,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06300_loop (__gl w0_x80[2] = w0[2]; w0_x80[3] = w0[3]; - append_0x80_1 (w0_x80, pw_len); + append_0x80_1x4 (w0_x80, pw_len); /** * salt diff --git a/OpenCL/m06400.cl b/OpenCL/m06400.cl index d6f9a91b3..2d0490311 100644 --- a/OpenCL/m06400.cl +++ b/OpenCL/m06400.cl @@ -308,9 +308,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06400_init (__gl salt_buf3[2] = 0; salt_buf3[3] = 0; - append_0x01_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 3); + append_0x01_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 3); - append_0x80_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 4); + append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 4); /** * pads diff --git a/OpenCL/m06500.cl b/OpenCL/m06500.cl index 191716418..7704580bc 100644 --- a/OpenCL/m06500.cl +++ b/OpenCL/m06500.cl @@ -312,9 +312,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06500_init (__gl salt_buf3[2] = 0; salt_buf3[3] = 0; - append_0x01_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 3); + append_0x01_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 3); - append_0x80_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 4); + append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 4); /** * pads diff --git a/OpenCL/m06600.cl b/OpenCL/m06600.cl index 6518a202c..f45b8a899 100644 --- a/OpenCL/m06600.cl +++ b/OpenCL/m06600.cl @@ -1174,8 +1174,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06600_init (__gl w3[2] = 0; w3[3] = 0; - append_0x01_1 (w0, salt_len + 3); - append_0x80_1 (w0, salt_len + 4); + append_0x01_1x4 (w0, salt_len + 3); + append_0x80_1x4 (w0, salt_len + 4); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m06700.cl b/OpenCL/m06700.cl index 1b1953a68..d82459389 100644 --- a/OpenCL/m06700.cl +++ b/OpenCL/m06700.cl @@ -318,9 +318,9 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06700_init (__gl salt_buf3[2] = 0; salt_buf3[3] = 0; - append_0x01_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 3); + append_0x01_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 3); - append_0x80_4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 4); + append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, salt_len + 4); /** * pads diff --git a/OpenCL/m06800.cl b/OpenCL/m06800.cl index c425485bb..6eacfa168 100644 --- a/OpenCL/m06800.cl +++ b/OpenCL/m06800.cl @@ -1312,8 +1312,8 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m06800_init (__gl w3[2] = 0; w3[3] = 0; - append_0x01_3 (w0, w1, w2, salt_len + 3); - append_0x80_3 (w0, w1, w2, salt_len + 4); + append_0x01_3x4 (w0, w1, w2, salt_len + 3); + append_0x80_3x4 (w0, w1, w2, salt_len + 4); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m07400.cl b/OpenCL/m07400.cl index ece0688de..53fa1fad9 100644 --- a/OpenCL/m07400.cl +++ b/OpenCL/m07400.cl @@ -792,7 +792,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl block_len = memcat16 (block, block_len, w0, pw_len); - append_0x80_4 (block, block_len); + append_0x80_4x4 (block, block_len); block[15] = swap_workaround (block_len * 8); @@ -867,7 +867,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl } } - append_0x80_4 (block, block_len); + append_0x80_4x4 (block, block_len); if (block_len >= 56) { @@ -912,7 +912,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl /* Finish the digest. */ - append_0x80_4 (block, block_len); + append_0x80_4x4 (block, block_len); if (block_len >= 56) { @@ -955,7 +955,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_init (__gl /* Finish the digest. */ - append_0x80_4 (block, block_len); + append_0x80_4x4 (block, block_len); if (block_len >= 56) { @@ -1008,7 +1008,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07400_loop (__gl p_bytes_x80[2] = tmps[gid].p_bytes[2]; p_bytes_x80[3] = tmps[gid].p_bytes[3]; - append_0x80_1 (p_bytes_x80, pw_len); + append_0x80_1x4 (p_bytes_x80, pw_len); u32 s_bytes[4]; diff --git a/OpenCL/m07500_a0.cl b/OpenCL/m07500_a0.cl index 65255059e..6e20bab64 100644 --- a/OpenCL/m07500_a0.cl +++ b/OpenCL/m07500_a0.cl @@ -451,7 +451,7 @@ static void kerb_prepare (const u32 w0[4], const u32 w1[4], const u32 pw_len, co // K=MD4(Little_indian(UNICODE(pwd)) - append_0x80_2 (w0_t, w1_t, pw_len); + append_0x80_2x4 (w0_t, w1_t, pw_len); make_unicode (w1_t, w2_t, w3_t); make_unicode (w0_t, w0_t, w1_t); diff --git a/OpenCL/m07500_a1.cl b/OpenCL/m07500_a1.cl index 55679f8af..5bbd454f9 100644 --- a/OpenCL/m07500_a1.cl +++ b/OpenCL/m07500_a1.cl @@ -449,7 +449,7 @@ static void kerb_prepare (const u32 w0[4], const u32 w1[4], const u32 pw_len, co // K=MD4(Little_indian(UNICODE(pwd)) - append_0x80_2 (w0_t, w1_t, pw_len); + append_0x80_2x4 (w0_t, w1_t, pw_len); make_unicode (w1_t, w2_t, w3_t); make_unicode (w0_t, w0_t, w1_t); diff --git a/OpenCL/m07500_a3.cl b/OpenCL/m07500_a3.cl index 7cd4cdbb0..9bdbfaf65 100644 --- a/OpenCL/m07500_a3.cl +++ b/OpenCL/m07500_a3.cl @@ -449,7 +449,7 @@ static void kerb_prepare (const u32 w0[4], const u32 w1[4], const u32 pw_len, co // K=MD4(Little_indian(UNICODE(pwd)) - append_0x80_2 (w0_t, w1_t, pw_len); + append_0x80_2x4 (w0_t, w1_t, pw_len); make_unicode (w1_t, w2_t, w3_t); make_unicode (w0_t, w0_t, w1_t); diff --git a/OpenCL/m07600_a0.cl b/OpenCL/m07600_a0.cl index 2940fdf34..663ad1592 100644 --- a/OpenCL/m07600_a0.cl +++ b/OpenCL/m07600_a0.cl @@ -171,7 +171,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -782,7 +782,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m07600_a1.cl b/OpenCL/m07600_a1.cl index b74fec370..2039e5985 100644 --- a/OpenCL/m07600_a1.cl +++ b/OpenCL/m07600_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -194,7 +194,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -727,7 +727,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -861,7 +861,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07600_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m07900.cl b/OpenCL/m07900.cl index 23b176de6..66061ea37 100644 --- a/OpenCL/m07900.cl +++ b/OpenCL/m07900.cl @@ -171,7 +171,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07900_init (__gl const u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); /** * salt @@ -274,7 +274,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m07900_loop (__gl const u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); /** * digest diff --git a/OpenCL/m08100_a0.cl b/OpenCL/m08100_a0.cl index 4493d9d13..c5dc31e65 100644 --- a/OpenCL/m08100_a0.cl +++ b/OpenCL/m08100_a0.cl @@ -123,7 +123,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo w3_t[2] = 0; w3_t[3] = (out_salt_len + 1) * 8; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len + 1); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len + 1); /** * sha1 @@ -384,7 +384,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo w3_t[2] = 0; w3_t[3] = (out_salt_len + 1) * 8; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, out_salt_len + 1); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len + 1); /** * sha1 diff --git a/OpenCL/m08100_a1.cl b/OpenCL/m08100_a1.cl index fa5f44b5c..5cc8e448a 100644 --- a/OpenCL/m08100_a1.cl +++ b/OpenCL/m08100_a1.cl @@ -169,7 +169,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo w3_t[2] = 0; w3_t[3] = (pw_salt_len + 1) * 8; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len + 1); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len + 1); /** * sha1 @@ -478,7 +478,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo w3_t[2] = 0; w3_t[3] = (pw_salt_len + 1) * 8; - append_0x80_4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len + 1); + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len + 1); /** * sha1 diff --git a/OpenCL/m08100_a3.cl b/OpenCL/m08100_a3.cl index a26fda4e5..ca8d2cd5c 100644 --- a/OpenCL/m08100_a3.cl +++ b/OpenCL/m08100_a3.cl @@ -440,7 +440,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m04 (__glo w0[2] = swap_workaround (w0[2]); w0[3] = swap_workaround (w0[3]); - append_0x80_2 (w0, w1, pw_len + 1); + append_0x80_2x4 (w0, w1, pw_len + 1); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); @@ -508,7 +508,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m08 (__glo w1[2] = swap_workaround (w1[2]); w1[3] = swap_workaround (w1[3]); - append_0x80_3 (w0, w1, w2, pw_len + 1); + append_0x80_3x4 (w0, w1, w2, pw_len + 1); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); @@ -588,7 +588,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_m16 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len + 1); + append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); @@ -663,7 +663,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s04 (__glo w0[2] = swap_workaround (w0[2]); w0[3] = swap_workaround (w0[3]); - append_0x80_2 (w0, w1, pw_len + 1); + append_0x80_2x4 (w0, w1, pw_len + 1); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); @@ -731,7 +731,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s08 (__glo w1[2] = swap_workaround (w1[2]); w1[3] = swap_workaround (w1[3]); - append_0x80_3 (w0, w1, w2, pw_len + 1); + append_0x80_3x4 (w0, w1, w2, pw_len + 1); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); @@ -811,7 +811,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08100_s16 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_4 (w0, w1, w2, w3, pw_len + 1); + append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m08400_a0.cl b/OpenCL/m08400_a0.cl index eb869f0f2..26beaff39 100644 --- a/OpenCL/m08400_a0.cl +++ b/OpenCL/m08400_a0.cl @@ -290,7 +290,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; @@ -613,7 +613,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; diff --git a/OpenCL/m08400_a1.cl b/OpenCL/m08400_a1.cl index b2264b4c5..135accee7 100644 --- a/OpenCL/m08400_a1.cl +++ b/OpenCL/m08400_a1.cl @@ -206,7 +206,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -313,7 +313,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -573,7 +573,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -692,7 +692,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m08800.cl b/OpenCL/m08800.cl index a00588ec8..6ff25255a 100644 --- a/OpenCL/m08800.cl +++ b/OpenCL/m08800.cl @@ -1552,11 +1552,11 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m08800_init (__gl w3[3] = 0; if (j == 1) - append_0x01_3 (w0, w1, w2, salt_len + 3); + append_0x01_3x4 (w0, w1, w2, salt_len + 3); else - append_0x02_3 (w0, w1, w2, salt_len + 3); + append_0x02_3x4 (w0, w1, w2, salt_len + 3); - append_0x80_3 (w0, w1, w2, salt_len + 4); + append_0x80_3x4 (w0, w1, w2, salt_len + 4); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m09000.cl b/OpenCL/m09000.cl index d938f24ec..3543f8d81 100644 --- a/OpenCL/m09000.cl +++ b/OpenCL/m09000.cl @@ -517,7 +517,7 @@ __kernel void __attribute__((reqd_work_group_size (8, 1, 1))) m09000_init (__glo const u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); /** * salt diff --git a/OpenCL/m09400.cl b/OpenCL/m09400.cl index 731da4ce4..df0c9b584 100644 --- a/OpenCL/m09400.cl +++ b/OpenCL/m09400.cl @@ -1368,7 +1368,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09400_init (__gl u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); make_unicode (w1, w2, w3); make_unicode (w0, w0, w1); diff --git a/OpenCL/m09500.cl b/OpenCL/m09500.cl index dfb6fd8b5..4fc5b74c5 100644 --- a/OpenCL/m09500.cl +++ b/OpenCL/m09500.cl @@ -1098,7 +1098,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09500_init (__gl u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); make_unicode (w1, w2, w3); make_unicode (w0, w0, w1); diff --git a/OpenCL/m09600.cl b/OpenCL/m09600.cl index 312f052f9..1541b3048 100644 --- a/OpenCL/m09600.cl +++ b/OpenCL/m09600.cl @@ -1136,7 +1136,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09600_init (__gl u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); make_unicode (w1, w2, w3); make_unicode (w0, w0, w1); diff --git a/OpenCL/m09700_a0.cl b/OpenCL/m09700_a0.cl index 79c9bc7ae..ce1ad1496 100644 --- a/OpenCL/m09700_a0.cl +++ b/OpenCL/m09700_a0.cl @@ -680,7 +680,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09700_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -907,7 +907,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09700_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09700_a1.cl b/OpenCL/m09700_a1.cl index 4ef61a263..e13710ecd 100644 --- a/OpenCL/m09700_a1.cl +++ b/OpenCL/m09700_a1.cl @@ -744,7 +744,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09700_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; @@ -1025,7 +1025,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09700_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09720_a0.cl b/OpenCL/m09720_a0.cl index a27a9a191..d03155255 100644 --- a/OpenCL/m09720_a0.cl +++ b/OpenCL/m09720_a0.cl @@ -543,7 +543,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09720_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -689,7 +689,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09720_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09720_a1.cl b/OpenCL/m09720_a1.cl index 44d95a88e..63e89973b 100644 --- a/OpenCL/m09720_a1.cl +++ b/OpenCL/m09720_a1.cl @@ -595,7 +595,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09720_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; @@ -795,7 +795,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09720_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09800_a0.cl b/OpenCL/m09800_a0.cl index f88f3688a..d20242ac1 100644 --- a/OpenCL/m09800_a0.cl +++ b/OpenCL/m09800_a0.cl @@ -370,7 +370,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo const u32 pw_salt_len = (out_len * 2) + salt_len; - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -615,7 +615,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo const u32 pw_salt_len = (out_len * 2) + salt_len; - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09800_a1.cl b/OpenCL/m09800_a1.cl index 76b94f5c9..fd770363a 100644 --- a/OpenCL/m09800_a1.cl +++ b/OpenCL/m09800_a1.cl @@ -422,7 +422,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; @@ -721,7 +721,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09800_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09820_a0.cl b/OpenCL/m09820_a0.cl index d6e3f12ed..fd6d1ad1e 100644 --- a/OpenCL/m09820_a0.cl +++ b/OpenCL/m09820_a0.cl @@ -235,7 +235,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_m04 (__glo const u32 pw_salt_len = (out_len * 2) + salt_len; - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; @@ -416,7 +416,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_s04 (__glo const u32 pw_salt_len = (out_len * 2) + salt_len; - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m09820_a1.cl b/OpenCL/m09820_a1.cl index 217e6e4e2..a3fea7e8a 100644 --- a/OpenCL/m09820_a1.cl +++ b/OpenCL/m09820_a1.cl @@ -285,7 +285,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_m04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; @@ -518,7 +518,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m09820_s04 (__glo w3[2] = 0; w3[3] = 0; - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); u32 w0_t[4]; u32 w1_t[4]; diff --git a/OpenCL/m10300.cl b/OpenCL/m10300.cl index 355d807e1..a5266aa25 100644 --- a/OpenCL/m10300.cl +++ b/OpenCL/m10300.cl @@ -250,7 +250,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10300_init (__gl const u32 pw_salt_len = pw_len + salt_len; - append_0x80_4 (w0, w1, w2, w3, pw_salt_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len); // swaps needed diff --git a/OpenCL/m10420_a1.cl b/OpenCL/m10420_a1.cl index d382b65b0..f83c5dcf5 100644 --- a/OpenCL/m10420_a1.cl +++ b/OpenCL/m10420_a1.cl @@ -178,7 +178,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10420_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -421,7 +421,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10420_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m10700.cl b/OpenCL/m10700.cl index 94ba2ca68..cebdaa4b9 100644 --- a/OpenCL/m10700.cl +++ b/OpenCL/m10700.cl @@ -1564,7 +1564,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10700_init (__gl block_len += salt_len; - append_0x80_2 (block0, block1, block_len); + append_0x80_2x4 (block0, block1, block_len); block3[3] = swap_workaround (block_len * 8); diff --git a/OpenCL/m10800_a0.cl b/OpenCL/m10800_a0.cl index 59d64d932..c3110d816 100644 --- a/OpenCL/m10800_a0.cl +++ b/OpenCL/m10800_a0.cl @@ -210,7 +210,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * SHA384 @@ -348,7 +348,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * SHA384 diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index 501f37897..e367cdf50 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -188,7 +188,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -233,7 +233,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -364,7 +364,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -421,7 +421,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m10800_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m11000_a0.cl b/OpenCL/m11000_a0.cl index 3e1cce51c..13a008784 100644 --- a/OpenCL/m11000_a0.cl +++ b/OpenCL/m11000_a0.cl @@ -119,7 +119,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11000_m04 (__glo const u32 pw_salt_len = pw_len + salt_len; - append_0x80_4 (w0, w1, w2, w3, out_len); + append_0x80_4x4 (w0, w1, w2, w3, out_len); /** * prepend salt @@ -459,7 +459,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11000_s04 (__glo const u32 pw_salt_len = pw_len + salt_len; - append_0x80_4 (w0, w1, w2, w3, out_len); + append_0x80_4x4 (w0, w1, w2, w3, out_len); /** * prepend salt diff --git a/OpenCL/m11000_a1.cl b/OpenCL/m11000_a1.cl index d391bd028..cf2b51e3e 100644 --- a/OpenCL/m11000_a1.cl +++ b/OpenCL/m11000_a1.cl @@ -162,7 +162,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11000_m04 (__glo w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -549,7 +549,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11000_s04 (__glo w3[2] = wordl3[2] | wordr3[2]; w3[3] = wordl3[3] | wordr3[3]; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m11200_a0.cl b/OpenCL/m11200_a0.cl index fc5f2ef51..de3eac6a1 100644 --- a/OpenCL/m11200_a0.cl +++ b/OpenCL/m11200_a0.cl @@ -102,7 +102,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 ($pass) @@ -607,7 +607,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 ($pass) diff --git a/OpenCL/m11200_a1.cl b/OpenCL/m11200_a1.cl index 24dadd33c..7279d1410 100644 --- a/OpenCL/m11200_a1.cl +++ b/OpenCL/m11200_a1.cl @@ -68,7 +68,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -125,7 +125,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -617,7 +617,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -686,7 +686,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11200_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl index 7f5529420..03ca34cda 100644 --- a/OpenCL/m11400_a0.cl +++ b/OpenCL/m11400_a0.cl @@ -922,7 +922,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); const u32 pw_salt_len = salt_len + out_len; @@ -1757,7 +1757,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); const u32 pw_salt_len = salt_len + out_len; diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl index 05a15ce91..272d48d60 100644 --- a/OpenCL/m11400_a1.cl +++ b/OpenCL/m11400_a1.cl @@ -790,7 +790,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -1667,7 +1667,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11400_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } diff --git a/OpenCL/m11600.cl b/OpenCL/m11600.cl index 95d102266..7e64dbe25 100644 --- a/OpenCL/m11600.cl +++ b/OpenCL/m11600.cl @@ -1770,7 +1770,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11600_comp (__gl u32 block_len = tmps[gid].block_len; u32 final_len = tmps[gid].final_len; - append_0x80_4 (block, block_len); + append_0x80_4x4 (block, block_len); if (block_len >= 56) { diff --git a/OpenCL/m11700_a0.cl b/OpenCL/m11700_a0.cl index 572c7acd7..00ffbc7a6 100644 --- a/OpenCL/m11700_a0.cl +++ b/OpenCL/m11700_a0.cl @@ -2386,7 +2386,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, &w[0], &w[1], pw_len); - append_0x01_2 (&w[0], &w[1], out_len); + append_0x01_2x4 (&w[0], &w[1], out_len); /** * reverse message block @@ -2573,7 +2573,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, &w[0], &w[1], pw_len); - append_0x01_2 (&w[0], &w[1], out_len); + append_0x01_2x4 (&w[0], &w[1], out_len); /** * reverse message block diff --git a/OpenCL/m11700_a1.cl b/OpenCL/m11700_a1.cl index ce49d2119..0d228e434 100644 --- a/OpenCL/m11700_a1.cl +++ b/OpenCL/m11700_a1.cl @@ -2373,7 +2373,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -2440,7 +2440,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_m04 (__glo w[14] = wordl3[1] | wordr3[1]; w[15] = wordl3[1] | wordr3[1]; - append_0x01_4 (&w[0], &w[1], &w[2], &w[3], pw_len); + append_0x01_4x4 (&w[0], &w[1], &w[2], &w[3], pw_len); /** * reverse message block @@ -2616,7 +2616,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -2683,7 +2683,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11700_s04 (__glo w[14] = wordl3[1] | wordr3[1]; w[15] = wordl3[1] | wordr3[1]; - append_0x01_4 (&w[0], &w[1], &w[2], &w[3], pw_len); + append_0x01_4x4 (&w[0], &w[1], &w[2], &w[3], pw_len); /** * reverse message block diff --git a/OpenCL/m11800_a0.cl b/OpenCL/m11800_a0.cl index 676e65111..622909966 100644 --- a/OpenCL/m11800_a0.cl +++ b/OpenCL/m11800_a0.cl @@ -2386,7 +2386,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, &w[0], &w[1], pw_len); - append_0x01_2 (&w[0], &w[1], out_len); + append_0x01_2x4 (&w[0], &w[1], out_len); /** * reverse message block @@ -2573,7 +2573,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, &w[0], &w[1], pw_len); - append_0x01_2 (&w[0], &w[1], out_len); + append_0x01_2x4 (&w[0], &w[1], out_len); /** * reverse message block diff --git a/OpenCL/m11800_a1.cl b/OpenCL/m11800_a1.cl index 943eff5c5..74e22b3b4 100644 --- a/OpenCL/m11800_a1.cl +++ b/OpenCL/m11800_a1.cl @@ -2374,7 +2374,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -2441,7 +2441,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_m04 (__glo w[14] = wordl3[1] | wordr3[1]; w[15] = wordl3[1] | wordr3[1]; - append_0x01_4 (&w[0], &w[1], &w[2], &w[3], pw_len); + append_0x01_4x4 (&w[0], &w[1], &w[2], &w[3], pw_len); /** * reverse message block @@ -2617,7 +2617,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -2684,7 +2684,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m11800_s04 (__glo w[14] = wordl3[1] | wordr3[1]; w[15] = wordl3[1] | wordr3[1]; - append_0x01_4 (&w[0], &w[1], &w[2], &w[3], pw_len); + append_0x01_4x4 (&w[0], &w[1], &w[2], &w[3], pw_len); /** * reverse message block diff --git a/OpenCL/m12200.cl b/OpenCL/m12200.cl index 1c35b46dc..279ab63d6 100644 --- a/OpenCL/m12200.cl +++ b/OpenCL/m12200.cl @@ -171,7 +171,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12200_init (__gl u32 pw_len = pws[gid].pw_len; - append_0x80_4 (w0, w1, w2, w3, pw_len); + append_0x80_4x4 (w0, w1, w2, w3, pw_len); w0[0] = swap_workaround (w0[0]); w0[1] = swap_workaround (w0[1]); diff --git a/OpenCL/m12600_a0.cl b/OpenCL/m12600_a0.cl index 231edec7d..3eab9304f 100644 --- a/OpenCL/m12600_a0.cl +++ b/OpenCL/m12600_a0.cl @@ -150,7 +150,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 @@ -548,7 +548,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); - append_0x80_2 (w0, w1, out_len); + append_0x80_2x4 (w0, w1, out_len); /** * sha1 diff --git a/OpenCL/m12600_a1.cl b/OpenCL/m12600_a1.cl index 1c4430ba0..5a1133ad4 100644 --- a/OpenCL/m12600_a1.cl +++ b/OpenCL/m12600_a1.cl @@ -78,7 +78,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -173,7 +173,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_m04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } @@ -521,7 +521,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - append_0x80_2 (wordl0, wordl1, pw_l_len); + append_0x80_2x4 (wordl0, wordl1, pw_l_len); switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } @@ -628,7 +628,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12600_s04 (__glo if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - append_0x80_2 (wordr0, wordr1, pw_r_len); + append_0x80_2x4 (wordr0, wordr1, pw_r_len); switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); } diff --git a/OpenCL/m12800.cl b/OpenCL/m12800.cl index a42d812d4..a4ec85b06 100644 --- a/OpenCL/m12800.cl +++ b/OpenCL/m12800.cl @@ -413,7 +413,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m12800_init (__gl * generate nthash */ - append_0x80_2 (w0, w1, pw_len); + append_0x80_2x4 (w0, w1, pw_len); make_unicode (w1, w2, w3); make_unicode (w0, w0, w1);