From 6907981f089dd64257fd9045cb3dd6ae1e5dd108 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 20 Aug 2017 12:50:24 +0200 Subject: [PATCH] Backport current state of optimized kernel rule engine to CPU --- OpenCL/inc_rp_optimized.cl | 94 ++- include/rp_kernel_on_cpu_optimized.h | 3 +- src/rp_kernel_on_cpu_optimized.c | 1013 +++++++++++++------------- 3 files changed, 539 insertions(+), 571 deletions(-) diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index 38158c534..f081a2937 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -3,6 +3,8 @@ * License.....: MIT */ +#define MAYBE_UNUSED + static u32 generate_cmask (const u32 value) { const u32 rmask = ((value & 0x40404040u) >> 1u) @@ -756,10 +758,6 @@ static void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset_mod_4; - u32 s0 = 0; u32 s1 = 0; u32 s2 = 0; @@ -882,6 +880,10 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 #endif #ifdef IS_NV + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const u32 src_r00 = src_r0[0]; @@ -1022,7 +1024,7 @@ static void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], con out1[3] = swap32_S (tib41[3]); } -static u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_lrest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -1036,7 +1038,7 @@ static u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -static u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_urest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { buf0[0] &= ~(generate_cmask (buf0[0])); buf0[1] &= ~(generate_cmask (buf0[1])); @@ -1050,7 +1052,7 @@ static u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -static u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_lrest_ufirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len); @@ -1059,7 +1061,7 @@ static u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_urest_lfirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { rule_op_mangle_urest (p0, p1, buf0, buf1, in_len); @@ -1068,7 +1070,7 @@ static u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_trest (MAYBE_UNUSED const 
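Note on the case-mangling rules in the hunks above: generate_cmask() builds a per-byte mask that is 0x20 for every byte of the word that holds an ASCII letter and 0x00 otherwise, so the lrest/urest/trest rules reduce to OR, AND-NOT and XOR with that mask. A small host-side sketch of the idea; it derives the mask byte-wise instead of with the kernel's bit trick, and assumes the usual little-endian packing of candidates into the u32 words:

// Illustrative, host-side sketch (not the kernel's exact bit trick): builds the
// same per-byte case mask that generate_cmask() produces, 0x20 for every byte
// that is an ASCII letter and 0x00 otherwise, and shows how the case rules use it.
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

static u32 cmask_ref (u32 v)
{
  u32 m = 0;

  for (int i = 0; i < 4; i++)
  {
    const u32 b = (v >> (i * 8)) & 0xff;

    const u32 c = b | 0x20; // fold to lowercase for the range test

    if (c >= 'a' && c <= 'z') m |= 0x20u << (i * 8);
  }

  return m;
}

int main (void)
{
  const u32 w = 0x34735050; // "PPs4" packed little-endian ('P','P','s','4')

  const u32 m = cmask_ref (w); // 0x00202020: letter bytes only

  printf ("lrest: %08x\n", w |  m); // lowercase every letter byte -> "pps4"
  printf ("urest: %08x\n", w & ~m); // uppercase every letter byte -> "PPS4"
  printf ("trest: %08x\n", w ^  m); // toggle the case of every letter byte -> "ppS4"

  return 0;
}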
u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { buf0[0] ^= (generate_cmask (buf0[0])); buf0[1] ^= (generate_cmask (buf0[1])); @@ -1082,7 +1084,7 @@ static u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -static u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_toggle_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1103,14 +1105,14 @@ static u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -static u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_reverse (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { reverse_block (buf0, buf1, buf0, buf1, in_len); return in_len; } -static u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupeword (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1123,7 +1125,7 @@ static u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -static u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupeword_times (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (((in_len * p0) + in_len) >= 32) return (in_len); @@ -1151,7 +1153,7 @@ static u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -static u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_reflect (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1169,7 +1171,7 @@ static u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -static u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_append (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1182,7 +1184,7 @@ static u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -static u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_prepend (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1197,7 +1199,7 @@ static u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -static u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_rotate_left (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len 
== 0) return (in_len); @@ -1221,7 +1223,7 @@ static u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_rotate_right (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1251,7 +1253,7 @@ static u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_delete_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1262,7 +1264,7 @@ static u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -static u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_delete_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1282,7 +1284,7 @@ static u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -static u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_delete_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1357,7 +1359,7 @@ static u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -static u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_extract (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1372,7 +1374,7 @@ static u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -static u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_omit (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1458,7 +1460,7 @@ static u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf return out_len; } -static u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_insert (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -1530,7 +1532,7 @@ static u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -static u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_overstrike (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1553,7 +1555,7 @@ static u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u 
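The positional rules in these hunks (toggle_at, delete_at, insert, overstrike, extract, omit, truncate_at) all address character p0 inside the packed candidate as word p0 / 4 with bit offset (p0 & 3) * 8; masks such as (1 << ((p0 & 3) * 8)) - 1 select the bytes of a word below that position, and their complement the bytes at or above it. A hedged sketch of that convention; get_byte/set_byte are illustrative helpers, not part of the patch:

// Sketch of the addressing convention the rules above rely on: the candidate is
// packed little-endian into buf0[0..3] and buf1[0..3], so character p0 lives in
// word p0 / 4 at bit offset (p0 & 3) * 8. Helpers are illustrative only.
#include <stdint.h>

typedef uint32_t u32;

static u32 get_byte (const u32 buf0[4], const u32 buf1[4], const u32 p0)
{
  const u32 w = (p0 < 16) ? buf0[p0 / 4] : buf1[(p0 - 16) / 4];

  return (w >> ((p0 & 3) * 8)) & 0xff;
}

static void set_byte (u32 buf0[4], u32 buf1[4], const u32 p0, const u32 c)
{
  u32 *w = (p0 < 16) ? &buf0[p0 / 4] : &buf1[(p0 - 16) / 4];

  const u32 sh = (p0 & 3) * 8;

  *w = (*w & ~(0xffu << sh)) | (c << sh);
}

int main (void)
{
  u32 buf0[4] = { 0x64636261, 0x68676665, 0, 0 }; // "abcdefgh"
  u32 buf1[4] = { 0, 0, 0, 0 };

  set_byte (buf0, buf1, 2, 'X'); // what the overstrike rule does at position 2

  return (get_byte (buf0, buf1, 2) == 'X') ? 0 : 1;
}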
return in_len; } -static u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_truncate_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1562,7 +1564,7 @@ static u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], return p0; } -static u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_replace (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { const uchar4 tmp0 = (uchar4) (p0); const uchar4 tmp1 = (uchar4) (p1); @@ -1581,7 +1583,7 @@ static u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -static u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_purgechar (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { u32 out_len = 0; @@ -1622,13 +1624,7 @@ static u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -static u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) -{ - // TODO - return in_len; -} - -static u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupechar_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1815,7 +1811,7 @@ static u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -static u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupechar_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1849,7 +1845,7 @@ static u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4] return out_len; } -static u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupechar_all (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + in_len) >= 32) return (in_len); @@ -1882,7 +1878,7 @@ static u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -static u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_switch_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1891,7 +1887,7 @@ static u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_switch_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, 
MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1976,7 +1972,7 @@ static u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_switch_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); if (p1 >= in_len) return (in_len); @@ -2223,7 +2219,7 @@ static u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -static u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_chr_shiftl (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2245,7 +2241,7 @@ static u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -static u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_chr_shiftr (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2267,7 +2263,7 @@ static u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -static u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_chr_incr (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2291,7 +2287,7 @@ static u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -static u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_chr_decr (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2315,7 +2311,7 @@ static u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -static u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_replace_np1 (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ((p0 + 1) >= in_len) return (in_len); @@ -2342,7 +2338,7 @@ static u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_replace_nm1 (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 == 0) return (in_len); @@ -2371,7 +2367,7 @@ static u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -static u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupeblock_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { 
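On the MAYBE_UNUSED qualifier added to every rule function above: it marks the parameters a given rule ignores (many rules take p0/p1 only to keep a uniform signature). The OpenCL include at the top of this patch defines the macro as empty; the host build is expected to supply its own definition, which is not shown here. A hypothetical host-side definition, purely as an illustration:

// Hypothetical host-side definition (not part of this patch): a GCC/Clang build
// could map MAYBE_UNUSED to the unused attribute so that rules ignoring p0/p1
// compile warning-free with -Wall, while other compilers see an empty macro.
#if defined (__GNUC__) || defined (__clang__)
#define MAYBE_UNUSED __attribute__ ((unused))
#else
#define MAYBE_UNUSED
#endif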
if (p0 > in_len) return (in_len); @@ -2409,7 +2405,7 @@ static u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[ return out_len; } -static u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_dupeblock_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2438,7 +2434,7 @@ static u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -static u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_title_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); diff --git a/include/rp_kernel_on_cpu_optimized.h b/include/rp_kernel_on_cpu_optimized.h index e3b503c0a..f8b81bed4 100644 --- a/include/rp_kernel_on_cpu_optimized.h +++ b/include/rp_kernel_on_cpu_optimized.h @@ -7,6 +7,7 @@ #define _RP_KERNEL_ON_CPU_OPTIMIZED_H u32 apply_rule_optimized (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len); -u32 apply_rules_optimized (u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len); + +u32 apply_rules_optimized (const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len); #endif // _RP_KERNEL_ON_CPU_OPTIMIZED_H diff --git a/src/rp_kernel_on_cpu_optimized.c b/src/rp_kernel_on_cpu_optimized.c index 46b43bf0b..3567dca1f 100644 --- a/src/rp_kernel_on_cpu_optimized.c +++ b/src/rp_kernel_on_cpu_optimized.c @@ -9,7 +9,7 @@ #include "rp.h" #include "rp_kernel_on_cpu_optimized.h" -static u32 amd_bytealign (const u32 a, const u32 b, const u32 c) +static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { const u64 tmp = ((((u64) (a)) << 32) | ((u64) (b))) >> ((c & 3) * 8); @@ -140,321 +140,26 @@ static void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) static void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { - out0[0] = in0[0] >> 8 | in0[1] << 24; - out0[1] = in0[1] >> 8 | in0[2] << 24; - out0[2] = in0[2] >> 8 | in0[3] << 24; - out0[3] = in0[3] >> 8 | in1[0] << 24; - out1[0] = in1[0] >> 8 | in1[1] << 24; - out1[1] = in1[1] >> 8 | in1[2] << 24; - out1[2] = in1[2] >> 8 | in1[3] << 24; - out1[3] = in1[3] >> 8; + out0[0] = amd_bytealign_S (in0[1], in0[0], 1); + out0[1] = amd_bytealign_S (in0[2], in0[1], 1); + out0[2] = amd_bytealign_S (in0[3], in0[2], 1); + out0[3] = amd_bytealign_S (in1[0], in0[3], 1); + out1[0] = amd_bytealign_S (in1[1], in1[0], 1); + out1[1] = amd_bytealign_S (in1[2], in1[1], 1); + out1[2] = amd_bytealign_S (in1[3], in1[2], 1); + out1[3] = amd_bytealign_S ( 0, in1[3], 1); } static void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { - out1[3] = in1[3] << 8 | in1[2] >> 24; - out1[2] = in1[2] << 8 | in1[1] >> 24; - out1[1] = in1[1] << 8 | in1[0] >> 24; - out1[0] = in1[0] << 8 | in0[3] >> 24; - out0[3] = in0[3] << 8 | in0[2] >> 24; - out0[2] = in0[2] << 8 | in0[1] >> 24; - out0[1] = in0[1] << 8 | in0[0] >> 24; - out0[0] = in0[0] << 8; -} - -static void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) -{ - switch (num) - { - case 0: out1[3] = in1[3]; - out1[2] = in1[2]; - out1[1] = in1[1]; - out1[0] = in1[0]; - out0[3] = in0[3]; - out0[2] = in0[2]; - out0[1] = 
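The CPU port now uses the same amd_bytealign_S() emulation as the GPU kernels: it concatenates a (high word) and b (low word) into a 64-bit value and shifts it right by c & 3 bytes, returning the low 32 bits. The rewritten lshift_block()/rshift_block() are direct applications of that, with the count flipped to 4 - n for right shifts. A minimal standalone check; the helper body is copied from this file and the test values are arbitrary:

// Minimal check of the shared helper plus the two identities the rewritten
// shift helpers rely on (valid for 1 <= n <= 3).
#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) tmp;
}

int main (void)
{
  // a supplies the high bytes, b the low bytes; c is the byte shift
  assert (amd_bytealign_S (0xaabbccdd, 0x11223344, 1) == 0xdd112233);
  assert (amd_bytealign_S (0xaabbccdd, 0x11223344, 3) == 0xbbccdd11);

  const u32 hi = 0x08070605; // characters 4..7 of a packed candidate
  const u32 lo = 0x04030201; // characters 0..3

  for (u32 n = 1; n < 4; n++)
  {
    // left shift of the block by n bytes (lshift_block*): count n
    assert (amd_bytealign_S (hi, lo, n) == ((lo >> (n * 8)) | (hi << (32 - n * 8))));

    // right shift of the block by n bytes (rshift_block*): count 4 - n
    assert (amd_bytealign_S (hi, lo, 4 - n) == ((hi << (n * 8)) | (lo >> (32 - n * 8))));
  }

  return 0;
}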
in0[1]; - out0[0] = in0[0]; - break; - case 1: out1[3] = in1[3] << 8 | in1[2] >> 24; - out1[2] = in1[2] << 8 | in1[1] >> 24; - out1[1] = in1[1] << 8 | in1[0] >> 24; - out1[0] = in1[0] << 8 | in0[3] >> 24; - out0[3] = in0[3] << 8 | in0[2] >> 24; - out0[2] = in0[2] << 8 | in0[1] >> 24; - out0[1] = in0[1] << 8 | in0[0] >> 24; - out0[0] = in0[0] << 8; - break; - case 2: out1[3] = in1[3] << 16 | in1[2] >> 16; - out1[2] = in1[2] << 16 | in1[1] >> 16; - out1[1] = in1[1] << 16 | in1[0] >> 16; - out1[0] = in1[0] << 16 | in0[3] >> 16; - out0[3] = in0[3] << 16 | in0[2] >> 16; - out0[2] = in0[2] << 16 | in0[1] >> 16; - out0[1] = in0[1] << 16 | in0[0] >> 16; - out0[0] = in0[0] << 16; - break; - case 3: out1[3] = in1[3] << 24 | in1[2] >> 8; - out1[2] = in1[2] << 24 | in1[1] >> 8; - out1[1] = in1[1] << 24 | in1[0] >> 8; - out1[0] = in1[0] << 24 | in0[3] >> 8; - out0[3] = in0[3] << 24 | in0[2] >> 8; - out0[2] = in0[2] << 24 | in0[1] >> 8; - out0[1] = in0[1] << 24 | in0[0] >> 8; - out0[0] = in0[0] << 24; - break; - case 4: out1[3] = in1[2]; - out1[2] = in1[1]; - out1[1] = in1[0]; - out1[0] = in0[3]; - out0[3] = in0[2]; - out0[2] = in0[1]; - out0[1] = in0[0]; - out0[0] = 0; - break; - case 5: out1[3] = in1[2] << 8 | in1[1] >> 24; - out1[2] = in1[1] << 8 | in1[0] >> 24; - out1[1] = in1[0] << 8 | in0[3] >> 24; - out1[0] = in0[3] << 8 | in0[2] >> 24; - out0[3] = in0[2] << 8 | in0[1] >> 24; - out0[2] = in0[1] << 8 | in0[0] >> 24; - out0[1] = in0[0] << 8; - out0[0] = 0; - break; - case 6: out1[3] = in1[2] << 16 | in1[1] >> 16; - out1[2] = in1[1] << 16 | in1[0] >> 16; - out1[1] = in1[0] << 16 | in0[3] >> 16; - out1[0] = in0[3] << 16 | in0[2] >> 16; - out0[3] = in0[2] << 16 | in0[1] >> 16; - out0[2] = in0[1] << 16 | in0[0] >> 16; - out0[1] = in0[0] << 16; - out0[0] = 0; - break; - case 7: out1[3] = in1[2] << 24 | in1[1] >> 8; - out1[2] = in1[1] << 24 | in1[0] >> 8; - out1[1] = in1[0] << 24 | in0[3] >> 8; - out1[0] = in0[3] << 24 | in0[2] >> 8; - out0[3] = in0[2] << 24 | in0[1] >> 8; - out0[2] = in0[1] << 24 | in0[0] >> 8; - out0[1] = in0[0] << 24; - out0[0] = 0; - break; - case 8: out1[3] = in1[1]; - out1[2] = in1[0]; - out1[1] = in0[3]; - out1[0] = in0[2]; - out0[3] = in0[1]; - out0[2] = in0[0]; - out0[1] = 0; - out0[0] = 0; - break; - case 9: out1[3] = in1[1] << 8 | in1[0] >> 24; - out1[2] = in1[0] << 8 | in0[3] >> 24; - out1[1] = in0[3] << 8 | in0[2] >> 24; - out1[0] = in0[2] << 8 | in0[1] >> 24; - out0[3] = in0[1] << 8 | in0[0] >> 24; - out0[2] = in0[0] << 8; - out0[1] = 0; - out0[0] = 0; - break; - case 10: out1[3] = in1[1] << 16 | in1[0] >> 16; - out1[2] = in1[0] << 16 | in0[3] >> 16; - out1[1] = in0[3] << 16 | in0[2] >> 16; - out1[0] = in0[2] << 16 | in0[1] >> 16; - out0[3] = in0[1] << 16 | in0[0] >> 16; - out0[2] = in0[0] << 16; - out0[1] = 0; - out0[0] = 0; - break; - case 11: out1[3] = in1[1] << 24 | in1[0] >> 8; - out1[2] = in1[0] << 24 | in0[3] >> 8; - out1[1] = in0[3] << 24 | in0[2] >> 8; - out1[0] = in0[2] << 24 | in0[1] >> 8; - out0[3] = in0[1] << 24 | in0[0] >> 8; - out0[2] = in0[0] << 24; - out0[1] = 0; - out0[0] = 0; - break; - case 12: out1[3] = in1[0]; - out1[2] = in0[3]; - out1[1] = in0[2]; - out1[0] = in0[1]; - out0[3] = in0[0]; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 13: out1[3] = in1[0] << 8 | in0[3] >> 24; - out1[2] = in0[3] << 8 | in0[2] >> 24; - out1[1] = in0[2] << 8 | in0[1] >> 24; - out1[0] = in0[1] << 8 | in0[0] >> 24; - out0[3] = in0[0] << 8; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 14: out1[3] = in1[0] << 16 | in0[3] >> 16; - out1[2] = in0[3] 
<< 16 | in0[2] >> 16; - out1[1] = in0[2] << 16 | in0[1] >> 16; - out1[0] = in0[1] << 16 | in0[0] >> 16; - out0[3] = in0[0] << 16; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 15: out1[3] = in1[0] << 24 | in0[3] >> 8; - out1[2] = in0[3] << 24 | in0[2] >> 8; - out1[1] = in0[2] << 24 | in0[1] >> 8; - out1[0] = in0[1] << 24 | in0[0] >> 8; - out0[3] = in0[0] << 24; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 16: out1[3] = in0[3]; - out1[2] = in0[2]; - out1[1] = in0[1]; - out1[0] = in0[0]; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 17: out1[3] = in0[3] << 8 | in0[2] >> 24; - out1[2] = in0[2] << 8 | in0[1] >> 24; - out1[1] = in0[1] << 8 | in0[0] >> 24; - out1[0] = in0[0] << 8; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 18: out1[3] = in0[3] << 16 | in0[2] >> 16; - out1[2] = in0[2] << 16 | in0[1] >> 16; - out1[1] = in0[1] << 16 | in0[0] >> 16; - out1[0] = in0[0] << 16; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 19: out1[3] = in0[3] << 24 | in0[2] >> 8; - out1[2] = in0[2] << 24 | in0[1] >> 8; - out1[1] = in0[1] << 24 | in0[0] >> 8; - out1[0] = in0[0] << 24; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 20: out1[3] = in0[2]; - out1[2] = in0[1]; - out1[1] = in0[0]; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 21: out1[3] = in0[2] << 8 | in0[1] >> 24; - out1[2] = in0[1] << 8 | in0[0] >> 24; - out1[1] = in0[0] << 8; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 22: out1[3] = in0[2] << 16 | in0[1] >> 16; - out1[2] = in0[1] << 16 | in0[0] >> 16; - out1[1] = in0[0] << 16; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 23: out1[3] = in0[2] << 24 | in0[1] >> 8; - out1[2] = in0[1] << 24 | in0[0] >> 8; - out1[1] = in0[0] << 24; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 24: out1[3] = in0[1]; - out1[2] = in0[0]; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 25: out1[3] = in0[1] << 8 | in0[0] >> 24; - out1[2] = in0[0] << 8; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 26: out1[3] = in0[1] << 16 | in0[0] >> 16; - out1[2] = in0[0] << 16; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 27: out1[3] = in0[1] << 24 | in0[0] >> 8; - out1[2] = in0[0] << 24; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 28: out1[3] = in0[0]; - out1[2] = 0; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 29: out1[3] = in0[0] << 8; - out1[2] = 0; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 30: out1[3] = in0[0] << 16; - out1[2] = 0; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - case 31: out1[3] = in0[0] << 24; - out1[2] = 0; - out1[1] = 0; - out1[0] = 0; - out0[3] = 0; - out0[2] = 0; - out0[1] = 0; - out0[0] = 0; - break; - } + out1[3] = amd_bytealign_S (in1[3], in1[2], 3); + out1[2] = amd_bytealign_S (in1[2], in1[1], 3); + out1[1] = amd_bytealign_S (in1[1], in1[0], 3); + out1[0] = amd_bytealign_S (in1[0], in0[3], 3); + out0[3] = amd_bytealign_S (in0[3], in0[2], 3); + out0[2] = amd_bytealign_S 
(in0[2], in0[1], 3); + out0[1] = amd_bytealign_S (in0[1], in0[0], 3); + out0[0] = amd_bytealign_S (in0[0], 0, 3); } static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) @@ -470,32 +175,32 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = in1[2]; out1[3] = in1[3]; break; - case 1: out0[0] = in0[0] >> 8 | in0[1] << 24; - out0[1] = in0[1] >> 8 | in0[2] << 24; - out0[2] = in0[2] >> 8 | in0[3] << 24; - out0[3] = in0[3] >> 8 | in1[0] << 24; - out1[0] = in1[0] >> 8 | in1[1] << 24; - out1[1] = in1[1] >> 8 | in1[2] << 24; - out1[2] = in1[2] >> 8 | in1[3] << 24; - out1[3] = in1[3] >> 8; - break; - case 2: out0[0] = in0[0] >> 16 | in0[1] << 16; - out0[1] = in0[1] >> 16 | in0[2] << 16; - out0[2] = in0[2] >> 16 | in0[3] << 16; - out0[3] = in0[3] >> 16 | in1[0] << 16; - out1[0] = in1[0] >> 16 | in1[1] << 16; - out1[1] = in1[1] >> 16 | in1[2] << 16; - out1[2] = in1[2] >> 16 | in1[3] << 16; - out1[3] = in1[3] >> 16; - break; - case 3: out0[0] = in0[0] >> 24 | in0[1] << 8; - out0[1] = in0[1] >> 24 | in0[2] << 8; - out0[2] = in0[2] >> 24 | in0[3] << 8; - out0[3] = in0[3] >> 24 | in1[0] << 8; - out1[0] = in1[0] >> 24 | in1[1] << 8; - out1[1] = in1[1] >> 24 | in1[2] << 8; - out1[2] = in1[2] >> 24 | in1[3] << 8; - out1[3] = in1[3] >> 24; + case 1: out0[0] = amd_bytealign_S (in0[1], in0[0], 1); + out0[1] = amd_bytealign_S (in0[2], in0[1], 1); + out0[2] = amd_bytealign_S (in0[3], in0[2], 1); + out0[3] = amd_bytealign_S (in1[0], in0[3], 1); + out1[0] = amd_bytealign_S (in1[1], in1[0], 1); + out1[1] = amd_bytealign_S (in1[2], in1[1], 1); + out1[2] = amd_bytealign_S (in1[3], in1[2], 1); + out1[3] = amd_bytealign_S ( 0, in1[3], 1); + break; + case 2: out0[0] = amd_bytealign_S (in0[1], in0[0], 2); + out0[1] = amd_bytealign_S (in0[2], in0[1], 2); + out0[2] = amd_bytealign_S (in0[3], in0[2], 2); + out0[3] = amd_bytealign_S (in1[0], in0[3], 2); + out1[0] = amd_bytealign_S (in1[1], in1[0], 2); + out1[1] = amd_bytealign_S (in1[2], in1[1], 2); + out1[2] = amd_bytealign_S (in1[3], in1[2], 2); + out1[3] = amd_bytealign_S ( 0, in1[3], 2); + break; + case 3: out0[0] = amd_bytealign_S (in0[1], in0[0], 3); + out0[1] = amd_bytealign_S (in0[2], in0[1], 3); + out0[2] = amd_bytealign_S (in0[3], in0[2], 3); + out0[3] = amd_bytealign_S (in1[0], in0[3], 3); + out1[0] = amd_bytealign_S (in1[1], in1[0], 3); + out1[1] = amd_bytealign_S (in1[2], in1[1], 3); + out1[2] = amd_bytealign_S (in1[3], in1[2], 3); + out1[3] = amd_bytealign_S ( 0, in1[3], 3); break; case 4: out0[0] = in0[1]; out0[1] = in0[2]; @@ -506,31 +211,31 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = in1[3]; out1[3] = 0; break; - case 5: out0[0] = in0[1] >> 8 | in0[2] << 24; - out0[1] = in0[2] >> 8 | in0[3] << 24; - out0[2] = in0[3] >> 8 | in1[0] << 24; - out0[3] = in1[0] >> 8 | in1[1] << 24; - out1[0] = in1[1] >> 8 | in1[2] << 24; - out1[1] = in1[2] >> 8 | in1[3] << 24; - out1[2] = in1[3] >> 8; + case 5: out0[0] = amd_bytealign_S (in0[2], in0[1], 1); + out0[1] = amd_bytealign_S (in0[3], in0[2], 1); + out0[2] = amd_bytealign_S (in1[0], in0[3], 1); + out0[3] = amd_bytealign_S (in1[1], in1[0], 1); + out1[0] = amd_bytealign_S (in1[2], in1[1], 1); + out1[1] = amd_bytealign_S (in1[3], in1[2], 1); + out1[2] = amd_bytealign_S ( 0, in1[3], 1); out1[3] = 0; break; - case 6: out0[0] = in0[1] >> 16 | in0[2] << 16; - out0[1] = in0[2] >> 16 | in0[3] << 16; - out0[2] = in0[3] >> 16 | in1[0] << 16; - out0[3] = in1[0] >> 16 | in1[1] << 16; - out1[0] = 
in1[1] >> 16 | in1[2] << 16; - out1[1] = in1[2] >> 16 | in1[3] << 16; - out1[2] = in1[3] >> 16; + case 6: out0[0] = amd_bytealign_S (in0[2], in0[1], 2); + out0[1] = amd_bytealign_S (in0[3], in0[2], 2); + out0[2] = amd_bytealign_S (in1[0], in0[3], 2); + out0[3] = amd_bytealign_S (in1[1], in1[0], 2); + out1[0] = amd_bytealign_S (in1[2], in1[1], 2); + out1[1] = amd_bytealign_S (in1[3], in1[2], 2); + out1[2] = amd_bytealign_S ( 0, in1[3], 2); out1[3] = 0; break; - case 7: out0[0] = in0[1] >> 24 | in0[2] << 8; - out0[1] = in0[2] >> 24 | in0[3] << 8; - out0[2] = in0[3] >> 24 | in1[0] << 8; - out0[3] = in1[0] >> 24 | in1[1] << 8; - out1[0] = in1[1] >> 24 | in1[2] << 8; - out1[1] = in1[2] >> 24 | in1[3] << 8; - out1[2] = in1[3] >> 24; + case 7: out0[0] = amd_bytealign_S (in0[2], in0[1], 3); + out0[1] = amd_bytealign_S (in0[3], in0[2], 3); + out0[2] = amd_bytealign_S (in1[0], in0[3], 3); + out0[3] = amd_bytealign_S (in1[1], in1[0], 3); + out1[0] = amd_bytealign_S (in1[2], in1[1], 3); + out1[1] = amd_bytealign_S (in1[3], in1[2], 3); + out1[2] = amd_bytealign_S ( 0, in1[3], 3); out1[3] = 0; break; case 8: out0[0] = in0[2]; @@ -542,30 +247,30 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 9: out0[0] = in0[2] >> 8 | in0[3] << 24; - out0[1] = in0[3] >> 8 | in1[0] << 24; - out0[2] = in1[0] >> 8 | in1[1] << 24; - out0[3] = in1[1] >> 8 | in1[2] << 24; - out1[0] = in1[2] >> 8 | in1[3] << 24; - out1[1] = in1[3] >> 8; + case 9: out0[0] = amd_bytealign_S (in0[3], in0[2], 1); + out0[1] = amd_bytealign_S (in1[0], in0[3], 1); + out0[2] = amd_bytealign_S (in1[1], in1[0], 1); + out0[3] = amd_bytealign_S (in1[2], in1[1], 1); + out1[0] = amd_bytealign_S (in1[3], in1[2], 1); + out1[1] = amd_bytealign_S ( 0, in1[3], 1); out1[2] = 0; out1[3] = 0; break; - case 10: out0[0] = in0[2] >> 16 | in0[3] << 16; - out0[1] = in0[3] >> 16 | in1[0] << 16; - out0[2] = in1[0] >> 16 | in1[1] << 16; - out0[3] = in1[1] >> 16 | in1[2] << 16; - out1[0] = in1[2] >> 16 | in1[3] << 16; - out1[1] = in1[3] >> 16; + case 10: out0[0] = amd_bytealign_S (in0[3], in0[2], 2); + out0[1] = amd_bytealign_S (in1[0], in0[3], 2); + out0[2] = amd_bytealign_S (in1[1], in1[0], 2); + out0[3] = amd_bytealign_S (in1[2], in1[1], 2); + out1[0] = amd_bytealign_S (in1[3], in1[2], 2); + out1[1] = amd_bytealign_S ( 0, in1[3], 2); out1[2] = 0; out1[3] = 0; break; - case 11: out0[0] = in0[2] >> 24 | in0[3] << 8; - out0[1] = in0[3] >> 24 | in1[0] << 8; - out0[2] = in1[0] >> 24 | in1[1] << 8; - out0[3] = in1[1] >> 24 | in1[2] << 8; - out1[0] = in1[2] >> 24 | in1[3] << 8; - out1[1] = in1[3] >> 24; + case 11: out0[0] = amd_bytealign_S (in0[3], in0[2], 3); + out0[1] = amd_bytealign_S (in1[0], in0[3], 3); + out0[2] = amd_bytealign_S (in1[1], in1[0], 3); + out0[3] = amd_bytealign_S (in1[2], in1[1], 3); + out1[0] = amd_bytealign_S (in1[3], in1[2], 3); + out1[1] = amd_bytealign_S ( 0, in1[3], 3); out1[2] = 0; out1[3] = 0; break; @@ -578,30 +283,29 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 13: - out0[0] = in0[3] >> 8 | in1[0] << 24; - out0[1] = in1[0] >> 8 | in1[1] << 24; - out0[2] = in1[1] >> 8 | in1[2] << 24; - out0[3] = in1[2] >> 8 | in1[3] << 24; - out1[0] = in1[3] >> 8; + case 13: out0[0] = amd_bytealign_S (in1[0], in0[3], 1); + out0[1] = amd_bytealign_S (in1[1], in1[0], 1); + out0[2] = amd_bytealign_S (in1[2], in1[1], 1); + out0[3] = amd_bytealign_S (in1[3], in1[2], 1); + out1[0] = amd_bytealign_S ( 0, in1[3], 1); 
out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 14: out0[0] = in0[3] >> 16 | in1[0] << 16; - out0[1] = in1[0] >> 16 | in1[1] << 16; - out0[2] = in1[1] >> 16 | in1[2] << 16; - out0[3] = in1[2] >> 16 | in1[3] << 16; - out1[0] = in1[3] >> 16; + case 14: out0[0] = amd_bytealign_S (in1[0], in0[3], 2); + out0[1] = amd_bytealign_S (in1[1], in1[0], 2); + out0[2] = amd_bytealign_S (in1[2], in1[1], 2); + out0[3] = amd_bytealign_S (in1[3], in1[2], 2); + out1[0] = amd_bytealign_S ( 0, in1[3], 2); out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 15: out0[0] = in0[3] >> 24 | in1[0] << 8; - out0[1] = in1[0] >> 24 | in1[1] << 8; - out0[2] = in1[1] >> 24 | in1[2] << 8; - out0[3] = in1[2] >> 24 | in1[3] << 8; - out1[0] = in1[3] >> 24; + case 15: out0[0] = amd_bytealign_S (in1[0], in0[3], 3); + out0[1] = amd_bytealign_S (in1[1], in1[0], 3); + out0[2] = amd_bytealign_S (in1[2], in1[1], 3); + out0[3] = amd_bytealign_S (in1[3], in1[2], 3); + out1[0] = amd_bytealign_S ( 0, in1[3], 3); out1[1] = 0; out1[2] = 0; out1[3] = 0; @@ -615,28 +319,28 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 17: out0[0] = in1[0] >> 8 | in1[1] << 24; - out0[1] = in1[1] >> 8 | in1[2] << 24; - out0[2] = in1[2] >> 8 | in1[3] << 24; - out0[3] = in1[3] >> 8; + case 17: out0[0] = amd_bytealign_S (in1[1], in1[0], 1); + out0[1] = amd_bytealign_S (in1[2], in1[1], 1); + out0[2] = amd_bytealign_S (in1[3], in1[2], 1); + out0[3] = amd_bytealign_S ( 0, in1[3], 1); out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 18: out0[0] = in1[0] >> 16 | in1[1] << 16; - out0[1] = in1[1] >> 16 | in1[2] << 16; - out0[2] = in1[2] >> 16 | in1[3] << 16; - out0[3] = in1[3] >> 16; + case 18: out0[0] = amd_bytealign_S (in1[1], in1[0], 2); + out0[1] = amd_bytealign_S (in1[2], in1[1], 2); + out0[2] = amd_bytealign_S (in1[3], in1[2], 2); + out0[3] = amd_bytealign_S ( 0, in1[3], 2); out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 19: out0[0] = in1[0] >> 24 | in1[1] << 8; - out0[1] = in1[1] >> 24 | in1[2] << 8; - out0[2] = in1[2] >> 24 | in1[3] << 8; - out0[3] = in1[3] >> 24; + case 19: out0[0] = amd_bytealign_S (in1[1], in1[0], 3); + out0[1] = amd_bytealign_S (in1[2], in1[1], 3); + out0[2] = amd_bytealign_S (in1[3], in1[2], 3); + out0[3] = amd_bytealign_S ( 0, in1[3], 3); out1[0] = 0; out1[1] = 0; out1[2] = 0; @@ -651,27 +355,27 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 21: out0[0] = in1[1] >> 8 | in1[2] << 24; - out0[1] = in1[2] >> 8 | in1[3] << 24; - out0[2] = in1[3] >> 8; + case 21: out0[0] = amd_bytealign_S (in1[2], in1[1], 1); + out0[1] = amd_bytealign_S (in1[3], in1[2], 1); + out0[2] = amd_bytealign_S ( 0, in1[3], 1); out0[3] = 0; out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 22: out0[0] = in1[1] >> 16 | in1[2] << 16; - out0[1] = in1[2] >> 16 | in1[3] << 16; - out0[2] = in1[3] >> 16; + case 22: out0[0] = amd_bytealign_S (in1[2], in1[1], 2); + out0[1] = amd_bytealign_S (in1[3], in1[2], 2); + out0[2] = amd_bytealign_S ( 0, in1[3], 2); out0[3] = 0; out1[0] = 0; out1[1] = 0; out1[2] = 0; out1[3] = 0; break; - case 23: out0[0] = in1[1] >> 24 | in1[2] << 8; - out0[1] = in1[2] >> 24 | in1[3] << 8; - out0[2] = in1[3] >> 24; + case 23: out0[0] = amd_bytealign_S (in1[2], in1[1], 3); + out0[1] = amd_bytealign_S (in1[3], in1[2], 3); + out0[2] = amd_bytealign_S ( 0, in1[3], 3); out0[3] = 0; out1[0] = 0; out1[1] = 0; @@ -687,8 +391,8 @@ static void 
lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 25: out0[0] = in1[2] >> 8 | in1[3] << 24; - out0[1] = in1[3] >> 8; + case 25: out0[0] = amd_bytealign_S (in1[3], in1[2], 1); + out0[1] = amd_bytealign_S ( 0, in1[3], 1); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -696,8 +400,8 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 26: out0[0] = in1[2] >> 16 | in1[3] << 16; - out0[1] = in1[3] >> 16; + case 26: out0[0] = amd_bytealign_S (in1[3], in1[2], 2); + out0[1] = amd_bytealign_S ( 0, in1[3], 2); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -705,8 +409,8 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 27: out0[0] = in1[2] >> 24 | in1[3] << 8; - out0[1] = in1[3] >> 24; + case 27: out0[0] = amd_bytealign_S (in1[3], in1[2], 3); + out0[1] = amd_bytealign_S ( 0, in1[3], 3); out0[2] = 0; out0[3] = 0; out1[0] = 0; @@ -723,7 +427,7 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 29: out0[0] = in1[3] >> 8; + case 29: out0[0] = amd_bytealign_S ( 0, in1[3], 1); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -732,7 +436,7 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 30: out0[0] = in1[3] >> 16; + case 30: out0[0] = amd_bytealign_S ( 0, in1[3], 2); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -741,7 +445,7 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[2] = 0; out1[3] = 0; break; - case 31: out0[0] = in1[3] >> 24; + case 31: out0[0] = amd_bytealign_S ( 0, in1[3], 3); out0[1] = 0; out0[2] = 0; out0[3] = 0; @@ -753,6 +457,301 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } +static void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +{ + switch (num) + { + case 0: out1[3] = in1[3]; + out1[2] = in1[2]; + out1[1] = in1[1]; + out1[0] = in1[0]; + out0[3] = in0[3]; + out0[2] = in0[2]; + out0[1] = in0[1]; + out0[0] = in0[0]; + break; + case 1: out1[3] = amd_bytealign_S (in1[3], in1[2], 3); + out1[2] = amd_bytealign_S (in1[2], in1[1], 3); + out1[1] = amd_bytealign_S (in1[1], in1[0], 3); + out1[0] = amd_bytealign_S (in1[0], in0[3], 3); + out0[3] = amd_bytealign_S (in0[3], in0[2], 3); + out0[2] = amd_bytealign_S (in0[2], in0[1], 3); + out0[1] = amd_bytealign_S (in0[1], in0[0], 3); + out0[0] = amd_bytealign_S (in0[0], 0, 3); + break; + case 2: out1[3] = amd_bytealign_S (in1[3], in1[2], 2); + out1[2] = amd_bytealign_S (in1[2], in1[1], 2); + out1[1] = amd_bytealign_S (in1[1], in1[0], 2); + out1[0] = amd_bytealign_S (in1[0], in0[3], 2); + out0[3] = amd_bytealign_S (in0[3], in0[2], 2); + out0[2] = amd_bytealign_S (in0[2], in0[1], 2); + out0[1] = amd_bytealign_S (in0[1], in0[0], 2); + out0[0] = amd_bytealign_S (in0[0], 0, 2); + break; + case 3: out1[3] = amd_bytealign_S (in1[3], in1[2], 1); + out1[2] = amd_bytealign_S (in1[2], in1[1], 1); + out1[1] = amd_bytealign_S (in1[1], in1[0], 1); + out1[0] = amd_bytealign_S (in1[0], in0[3], 1); + out0[3] = amd_bytealign_S (in0[3], in0[2], 1); + out0[2] = amd_bytealign_S (in0[2], in0[1], 1); + out0[1] = amd_bytealign_S (in0[1], in0[0], 1); + out0[0] = amd_bytealign_S (in0[0], 0, 1); + break; + case 4: out1[3] = in1[2]; + out1[2] = in1[1]; + out1[1] = in1[0]; + out1[0] = in0[3]; + out0[3] = in0[2]; + out0[2] = 
in0[1]; + out0[1] = in0[0]; + out0[0] = 0; + break; + case 5: out1[3] = amd_bytealign_S (in1[2], in1[1], 3); + out1[2] = amd_bytealign_S (in1[1], in1[0], 3); + out1[1] = amd_bytealign_S (in1[0], in0[3], 3); + out1[0] = amd_bytealign_S (in0[3], in0[2], 3); + out0[3] = amd_bytealign_S (in0[2], in0[1], 3); + out0[2] = amd_bytealign_S (in0[1], in0[0], 3); + out0[1] = amd_bytealign_S (in0[0], 0, 3); + out0[0] = 0; + break; + case 6: out1[3] = amd_bytealign_S (in1[2], in1[1], 2); + out1[2] = amd_bytealign_S (in1[1], in1[0], 2); + out1[1] = amd_bytealign_S (in1[0], in0[3], 2); + out1[0] = amd_bytealign_S (in0[3], in0[2], 2); + out0[3] = amd_bytealign_S (in0[2], in0[1], 2); + out0[2] = amd_bytealign_S (in0[1], in0[0], 2); + out0[1] = amd_bytealign_S (in0[0], 0, 2); + out0[0] = 0; + break; + case 7: out1[3] = amd_bytealign_S (in1[2], in1[1], 1); + out1[2] = amd_bytealign_S (in1[1], in1[0], 1); + out1[1] = amd_bytealign_S (in1[0], in0[3], 1); + out1[0] = amd_bytealign_S (in0[3], in0[2], 1); + out0[3] = amd_bytealign_S (in0[2], in0[1], 1); + out0[2] = amd_bytealign_S (in0[1], in0[0], 1); + out0[1] = amd_bytealign_S (in0[0], 0, 1); + out0[0] = 0; + break; + case 8: out1[3] = in1[1]; + out1[2] = in1[0]; + out1[1] = in0[3]; + out1[0] = in0[2]; + out0[3] = in0[1]; + out0[2] = in0[0]; + out0[1] = 0; + out0[0] = 0; + break; + case 9: out1[3] = amd_bytealign_S (in1[1], in1[0], 3); + out1[2] = amd_bytealign_S (in1[0], in0[3], 3); + out1[1] = amd_bytealign_S (in0[3], in0[2], 3); + out1[0] = amd_bytealign_S (in0[2], in0[1], 3); + out0[3] = amd_bytealign_S (in0[1], in0[0], 3); + out0[2] = amd_bytealign_S (in0[0], 0, 3); + out0[1] = 0; + out0[0] = 0; + break; + case 10: out1[3] = amd_bytealign_S (in1[1], in1[0], 2); + out1[2] = amd_bytealign_S (in1[0], in0[3], 2); + out1[1] = amd_bytealign_S (in0[3], in0[2], 2); + out1[0] = amd_bytealign_S (in0[2], in0[1], 2); + out0[3] = amd_bytealign_S (in0[1], in0[0], 2); + out0[2] = amd_bytealign_S (in0[0], 0, 2); + out0[1] = 0; + out0[0] = 0; + break; + case 11: out1[3] = amd_bytealign_S (in1[1], in1[0], 1); + out1[2] = amd_bytealign_S (in1[0], in0[3], 1); + out1[1] = amd_bytealign_S (in0[3], in0[2], 1); + out1[0] = amd_bytealign_S (in0[2], in0[1], 1); + out0[3] = amd_bytealign_S (in0[1], in0[0], 1); + out0[2] = amd_bytealign_S (in0[0], 0, 1); + out0[1] = 0; + out0[0] = 0; + break; + case 12: out1[3] = in1[0]; + out1[2] = in0[3]; + out1[1] = in0[2]; + out1[0] = in0[1]; + out0[3] = in0[0]; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 13: out1[3] = amd_bytealign_S (in1[0], in0[3], 3); + out1[2] = amd_bytealign_S (in0[3], in0[2], 3); + out1[1] = amd_bytealign_S (in0[2], in0[1], 3); + out1[0] = amd_bytealign_S (in0[1], in0[0], 3); + out0[3] = amd_bytealign_S (in0[0], 0, 3); + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 14: out1[3] = amd_bytealign_S (in1[0], in0[3], 2); + out1[2] = amd_bytealign_S (in0[3], in0[2], 2); + out1[1] = amd_bytealign_S (in0[2], in0[1], 2); + out1[0] = amd_bytealign_S (in0[1], in0[0], 2); + out0[3] = amd_bytealign_S (in0[0], 0, 2); + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 15: out1[3] = amd_bytealign_S (in1[0], in0[3], 1); + out1[2] = amd_bytealign_S (in0[3], in0[2], 1); + out1[1] = amd_bytealign_S (in0[2], in0[1], 1); + out1[0] = amd_bytealign_S (in0[1], in0[0], 1); + out0[3] = amd_bytealign_S (in0[0], 0, 1); + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 16: out1[3] = in0[3]; + out1[2] = in0[2]; + out1[1] = in0[1]; + out1[0] = in0[0]; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] 
= 0; + break; + case 17: out1[3] = amd_bytealign_S (in0[3], in0[2], 3); + out1[2] = amd_bytealign_S (in0[2], in0[1], 3); + out1[1] = amd_bytealign_S (in0[1], in0[0], 3); + out1[0] = amd_bytealign_S (in0[0], 0, 3); + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 18: out1[3] = amd_bytealign_S (in0[3], in0[2], 2); + out1[2] = amd_bytealign_S (in0[2], in0[1], 2); + out1[1] = amd_bytealign_S (in0[1], in0[0], 2); + out1[0] = amd_bytealign_S (in0[0], 0, 2); + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 19: out1[3] = amd_bytealign_S (in0[3], in0[2], 1); + out1[2] = amd_bytealign_S (in0[2], in0[1], 1); + out1[1] = amd_bytealign_S (in0[1], in0[0], 1); + out1[0] = amd_bytealign_S (in0[0], 0, 1); + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 20: out1[3] = in0[2]; + out1[2] = in0[1]; + out1[1] = in0[0]; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 21: out1[3] = amd_bytealign_S (in0[2], in0[1], 3); + out1[2] = amd_bytealign_S (in0[1], in0[0], 3); + out1[1] = amd_bytealign_S (in0[0], 0, 3); + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 22: out1[3] = amd_bytealign_S (in0[2], in0[1], 2); + out1[2] = amd_bytealign_S (in0[1], in0[0], 2); + out1[1] = amd_bytealign_S (in0[0], 0, 2); + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 23: out1[3] = amd_bytealign_S (in0[2], in0[1], 1); + out1[2] = amd_bytealign_S (in0[1], in0[0], 1); + out1[1] = amd_bytealign_S (in0[0], 0, 1); + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 24: out1[3] = in0[1]; + out1[2] = in0[0]; + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 25: out1[3] = amd_bytealign_S (in0[1], in0[0], 3); + out1[2] = amd_bytealign_S (in0[0], 0, 3); + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 26: out1[3] = amd_bytealign_S (in0[1], in0[0], 2); + out1[2] = amd_bytealign_S (in0[0], 0, 2); + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 27: out1[3] = amd_bytealign_S (in0[1], in0[0], 1); + out1[2] = amd_bytealign_S (in0[0], 0, 1); + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 28: out1[3] = in0[0]; + out1[2] = 0; + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 29: out1[3] = amd_bytealign_S (in0[0], 0, 3); + out1[2] = 0; + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 30: out1[3] = amd_bytealign_S (in0[0], 0, 2); + out1[2] = 0; + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + case 31: out1[3] = amd_bytealign_S (in0[0], 0, 1); + out1[2] = 0; + out1[1] = 0; + out1[0] = 0; + out0[3] = 0; + out0[2] = 0; + out0[1] = 0; + out0[0] = 0; + break; + } +} + static void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) { // this version works with 1 byte append only @@ -775,10 +774,6 @@ static void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - 
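The 32-case tables above (lshift_block_N, and the rshift_block_N re-added here in terms of amd_bytealign_S) shift the whole 32-byte candidate by num bytes and zero-fill the freed end. For spot-checking those tables on the host, a byte-array reference is handy. This sketch assumes a little-endian host, so that the in-memory byte order of the u32 words matches character order; the function name is illustrative, not part of the patch:

// Illustrative reference for lshift_block_N / rshift_block_N: treat the two
// u32[4] halves as one 32-byte string and shift it by num bytes, zero-filling.
#include <stdint.h>
#include <string.h>

typedef uint32_t u32;

static void shift_block_N_ref (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num, const int right)
{
  uint8_t tmp[32];

  memcpy (tmp,      in0, 16);
  memcpy (tmp + 16, in1, 16);

  uint8_t res[32] = { 0 };

  if (right)
  {
    memcpy (res + num, tmp, 32 - num); // characters move toward higher positions
  }
  else
  {
    memcpy (res, tmp + num, 32 - num); // characters move toward lower positions
  }

  memcpy (out0, res,      16);
  memcpy (out1, res + 16, 16);
}

int main (void)
{
  const u32 in0[4] = { 0x64636261, 0x68676665, 0x6c6b6a69, 0x706f6e6d }; // "abcdefghijklmnop"
  const u32 in1[4] = { 0, 0, 0, 0 };

  u32 out0[4]; u32 out1[4];

  shift_block_N_ref (in0, in1, out0, out1, 2, 0); // left by 2: "cdefghijklmnop"

  return (out0[0] == 0x66656463) ? 0 : 1; // "cdef"
}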
offset; - u32 s0 = 0; u32 s1 = 0; u32 s2 = 0; @@ -787,64 +782,68 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 u32 s5 = 0; u32 s6 = 0; u32 s7 = 0; - u32 s8 = 0; + + const u32 src_r00 = swap32_S (src_r0[0]); + const u32 src_r01 = swap32_S (src_r0[1]); + const u32 src_r02 = swap32_S (src_r0[2]); + const u32 src_r03 = swap32_S (src_r0[3]); + const u32 src_r10 = swap32_S (src_r1[0]); + const u32 src_r11 = swap32_S (src_r1[1]); + const u32 src_r12 = swap32_S (src_r1[2]); + const u32 src_r13 = swap32_S (src_r1[3]); switch (offset / 4) { case 0: - s8 = amd_bytealign ( 0, src_r1[3], offset_minus_4); - s7 = amd_bytealign (src_r1[3], src_r1[2], offset_minus_4); - s6 = amd_bytealign (src_r1[2], src_r1[1], offset_minus_4); - s5 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4); - s4 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4); - s3 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4); - s2 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4); - s1 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s0 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r12, src_r13, offset); + s6 = amd_bytealign_S (src_r11, src_r12, offset); + s5 = amd_bytealign_S (src_r10, src_r11, offset); + s4 = amd_bytealign_S (src_r03, src_r10, offset); + s3 = amd_bytealign_S (src_r02, src_r03, offset); + s2 = amd_bytealign_S (src_r01, src_r02, offset); + s1 = amd_bytealign_S (src_r00, src_r01, offset); + s0 = amd_bytealign_S ( 0, src_r00, offset); break; case 1: - s8 = amd_bytealign ( 0, src_r1[2], offset_minus_4); - s7 = amd_bytealign (src_r1[2], src_r1[1], offset_minus_4); - s6 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4); - s5 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4); - s4 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4); - s3 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4); - s2 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s1 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r11, src_r12, offset); + s6 = amd_bytealign_S (src_r10, src_r11, offset); + s5 = amd_bytealign_S (src_r03, src_r10, offset); + s4 = amd_bytealign_S (src_r02, src_r03, offset); + s3 = amd_bytealign_S (src_r01, src_r02, offset); + s2 = amd_bytealign_S (src_r00, src_r01, offset); + s1 = amd_bytealign_S ( 0, src_r00, offset); s0 = 0; break; case 2: - s8 = amd_bytealign ( 0, src_r1[1], offset_minus_4); - s7 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4); - s6 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4); - s5 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4); - s4 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4); - s3 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s2 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r10, src_r11, offset); + s6 = amd_bytealign_S (src_r03, src_r10, offset); + s5 = amd_bytealign_S (src_r02, src_r03, offset); + s4 = amd_bytealign_S (src_r01, src_r02, offset); + s3 = amd_bytealign_S (src_r00, src_r01, offset); + s2 = amd_bytealign_S ( 0, src_r00, offset); s1 = 0; s0 = 0; break; case 3: - s8 = amd_bytealign ( 0, src_r1[0], offset_minus_4); - s7 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4); - s6 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4); - s5 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4); - s4 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s3 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r03, src_r10, offset); + s6 = amd_bytealign_S 
(src_r02, src_r03, offset); + s5 = amd_bytealign_S (src_r01, src_r02, offset); + s4 = amd_bytealign_S (src_r00, src_r01, offset); + s3 = amd_bytealign_S ( 0, src_r00, offset); s2 = 0; s1 = 0; s0 = 0; + break; case 4: - s8 = amd_bytealign ( 0, src_r0[3], offset_minus_4); - s7 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4); - s6 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4); - s5 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s4 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r02, src_r03, offset); + s6 = amd_bytealign_S (src_r01, src_r02, offset); + s5 = amd_bytealign_S (src_r00, src_r01, offset); + s4 = amd_bytealign_S ( 0, src_r00, offset); s3 = 0; s2 = 0; s1 = 0; @@ -852,10 +851,9 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 5: - s8 = amd_bytealign ( 0, src_r0[2], offset_minus_4); - s7 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4); - s6 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s5 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r01, src_r02, offset); + s6 = amd_bytealign_S (src_r00, src_r01, offset); + s5 = amd_bytealign_S ( 0, src_r00, offset); s4 = 0; s3 = 0; s2 = 0; @@ -864,9 +862,8 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 6: - s8 = amd_bytealign ( 0, src_r0[1], offset_minus_4); - s7 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4); - s6 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r00, src_r01, offset); + s6 = amd_bytealign_S ( 0, src_r00, offset); s5 = 0; s4 = 0; s3 = 0; @@ -876,8 +873,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; case 7: - s8 = amd_bytealign ( 0, src_r0[0], offset_minus_4); - s7 = amd_bytealign (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S ( 0, src_r00, offset); s6 = 0; s5 = 0; s4 = 0; @@ -888,28 +884,23 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 break; } - if (offset_mod_4 == 0) - { - buf0[0] = src_l0[0] | s1; - buf0[1] = src_l0[1] | s2; - buf0[2] = src_l0[2] | s3; - buf0[3] = src_l0[3] | s4; - buf1[0] = src_l1[0] | s5; - buf1[1] = src_l1[1] | s6; - buf1[2] = src_l1[2] | s7; - buf1[3] = src_l1[3] | s8; - } - else - { - buf0[0] = src_l0[0] | s0; - buf0[1] = src_l0[1] | s1; - buf0[2] = src_l0[2] | s2; - buf0[3] = src_l0[3] | s3; - buf1[0] = src_l1[0] | s4; - buf1[1] = src_l1[1] | s5; - buf1[2] = src_l1[2] | s6; - buf1[3] = src_l1[3] | s7; - } + s0 = swap32_S (s0); + s1 = swap32_S (s1); + s2 = swap32_S (s2); + s3 = swap32_S (s3); + s4 = swap32_S (s4); + s5 = swap32_S (s5); + s6 = swap32_S (s6); + s7 = swap32_S (s7); + + buf0[0] = src_l0[0] | s0; + buf0[1] = src_l0[1] | s1; + buf0[2] = src_l0[2] | s2; + buf0[3] = src_l0[3] | s3; + buf1[0] = src_l1[0] | s4; + buf1[1] = src_l1[1] | s5; + buf1[2] = src_l1[2] | s6; + buf1[3] = src_l1[3] | s7; } static void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) @@ -1032,19 +1023,7 @@ static u32 rule_op_mangle_dupeword (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED cons u32 out_len = in_len; - u32 tib40[4]; - u32 tib41[4]; - - tib40[0] = buf0[0]; - tib40[1] = buf0[1]; - tib40[2] = buf0[2]; - tib40[3] = buf0[3]; - tib41[0] = buf1[0]; - tib41[1] = buf1[1]; - tib41[2] = buf1[2]; - tib41[3] = buf1[3]; - - append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41); + append_block8 (out_len, buf0, buf1, buf0, buf1, buf0, buf1); out_len += in_len; @@ -1149,7 +1128,7 
@@ static u32 rule_op_mangle_rotate_left (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED c return in_len; } -static u32 rule_op_mangle_rotate_right (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +static u32 rule_op_mangle_rotate_right (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1221,7 +1200,7 @@ static u32 rule_op_mangle_delete_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED con lshift_block (buf0, buf1, tib40, tib41); - const u32 ml = (1u << ((p0 & 3) * 8)) - 1; + const u32 ml = (1 << ((p0 & 3) * 8)) - 1; const u32 mr = ~ml; switch (p0 / 4) @@ -1322,7 +1301,7 @@ static u32 rule_op_mangle_omit (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u3 lshift_block_N (buf0, buf1, tib40, tib41, p1); - const u32 ml = (1u << ((p0 & 3) * 8)) - 1; + const u32 ml = (1 << ((p0 & 3) * 8)) - 1; const u32 mr = ~ml; switch (p0 / 4) @@ -1401,7 +1380,7 @@ static u32 rule_op_mangle_insert (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const const u32 p1n = p1 << ((p0 & 3) * 8); - const u32 ml = (1u << ((p0 & 3) * 8)) - 1; + const u32 ml = (1 << ((p0 & 3) * 8)) - 1; const u32 mr = 0xffffff00 << ((p0 & 3) * 8); @@ -1538,7 +1517,7 @@ static u32 rule_op_mangle_purgechar (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED con { u32 out_len = 0; - u32 buf_in[8] = { 0 }; + u32 buf_in[8]; buf_in[0] = buf0[0]; buf_in[1] = buf0[1]; @@ -1575,14 +1554,6 @@ static u32 rule_op_mangle_purgechar (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED con return out_len; } -/* -static u32 rule_op_mangle_togglecase_rec (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) -{ - // TODO - return in_len; -} -*/ - static u32 rule_op_mangle_dupechar_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); @@ -2568,7 +2539,7 @@ u32 apply_rule_optimized (const u32 name, const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -u32 apply_rules_optimized (u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) +u32 apply_rules_optimized (const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) { u32 out_len = len;
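Why append_block8() now byte-swaps its sources: with the source words reversed to big-endian, a single amd_bytealign_S (prev, cur, offset) shifts the appended block toward higher character positions by offset & 3 bytes, and swapping the result back restores the little-endian packing. That removes both the old offset_mod_4 == 0 special case and the ninth intermediate word s8. A small check of the underlying identity; swap32_S is assumed to be the usual 32-bit byte swap and the test values are arbitrary:

// Why the swap works: in big-endian order, a plain right shift of the 64-bit
// pair by offset & 3 bytes acts as a shift of the string toward higher
// character positions once the result is swapped back to little-endian.
#include <assert.h>
#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

static u32 swap32_S (const u32 v)
{
  return (v >> 24) | ((v >> 8) & 0x0000ff00) | ((v << 8) & 0x00ff0000) | (v << 24);
}

static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) tmp;
}

int main (void)
{
  const u32 prev = 0x44332211; // characters 0..3 of the appended word pair
  const u32 cur  = 0x88776655; // characters 4..7

  for (u32 n = 1; n < 4; n++)
  {
    const u32 via_swap = swap32_S (amd_bytealign_S (swap32_S (prev), swap32_S (cur), n));

    const u32 direct = (cur << (n * 8)) | (prev >> (32 - n * 8)); // rshift_block-style form

    assert (via_swap == direct);
  }

  // a word-aligned append (offset & 3 == 0) degenerates to a plain copy,
  // which is why the old offset_mod_4 special case could be dropped
  assert (swap32_S (amd_bytealign_S (swap32_S (prev), swap32_S (cur), 0)) == cur);

  return 0;
}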
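Finally, the header change makes apply_rules_optimized() take const u32 *cmds, so callers can hand over rule data without casting away const. A hedged usage sketch for the single-rule entry point apply_rule_optimized() declared in the same header; the include order, the u32 type and the RULE_OP_MANGLE_REVERSE constant are assumed to come from the project's existing common.h/types.h/rp.h headers, and the packing follows the convention used throughout this file:

// Hedged usage sketch, not part of the patch: pack a candidate little-endian
// (first character in the low byte of buf0[0]), apply the reverse rule on the
// CPU, and unpack the result.
#include "common.h"                       // hashcat host headers; include order is an
#include "types.h"                        // assumption based on the project's other .c files
#include "rp.h"
#include "rp_kernel_on_cpu_optimized.h"

#include <stdio.h>
#include <string.h>

int main (void)
{
  const char *pw = "password1";

  u32 buf[8] = { 0 };                     // 32-byte candidate buffer, zero padded

  memcpy (buf, pw, strlen (pw));          // little-endian host: byte order == character order

  u32 buf0[4]; u32 buf1[4];

  memcpy (buf0, buf + 0, 16);
  memcpy (buf1, buf + 4, 16);

  // RULE_OP_MANGLE_REVERSE ('r') is assumed to be the opcode constant from rp.h
  const u32 out_len = apply_rule_optimized (RULE_OP_MANGLE_REVERSE, 0, 0, buf0, buf1, (u32) strlen (pw));

  memcpy (buf + 0, buf0, 16);
  memcpy (buf + 4, buf1, 16);

  printf ("%.*s\n", (int) out_len, (const char *) buf); // expected: 1drowssap

  return 0;
}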