diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl
index a1d7c8c07..81832e137 100644
--- a/OpenCL/inc_rp.cl
+++ b/OpenCL/inc_rp.cl
@@ -3,123 +3,119 @@
  * License.....: MIT
  */

-inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len);
-inline u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len);
-inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4]);
-
-inline u32 generate_cmask (u32 buf)
+inline u32 generate_cmask (const u32 value)
 {
-  const u32 rmask = ((buf & 0x40404040) >> 1)
-                  & ~((buf & 0x80808080) >> 2);
+  const u32 rmask = ((value & 0x40404040u) >> 1u)
+                  & ~((value & 0x80808080u) >> 2u);

-  const u32 hmask = (buf & 0x1f1f1f1f) + 0x05050505;
-  const u32 lmask = (buf & 0x1f1f1f1f) + 0x1f1f1f1f;
+  const u32 hmask = (value & 0x1f1f1f1fu) + 0x05050505u;
+  const u32 lmask = (value & 0x1f1f1f1fu) + 0x1f1f1f1fu;

   return rmask & ~hmask & lmask;
 }

-inline void truncate_right (u32 w0[4], u32 w1[4], const u32 len)
+inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset)
 {
-  const u32 tmp = (1 << ((len % 4) * 8)) - 1;
+  const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u;

-  switch (len / 4)
+  switch (offset / 4)
   {
-    case 0: w0[0] &= tmp;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 0: buf0[0] &= tmp;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 1: w0[1] &= tmp;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 1: buf0[1] &= tmp;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 2: w0[2] &= tmp;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 2: buf0[2] &= tmp;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 3: w0[3] &= tmp;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 3: buf0[3] &= tmp;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 4: w1[0] &= tmp;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 4: buf1[0] &= tmp;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 5: w1[1] &= tmp;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 5: buf1[1] &= tmp;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 6: w1[2] &= tmp;
-            w1[3] = 0;
+    case 6: buf1[2] &= tmp;
+            buf1[3] = 0;
             break;
-    case 7: w1[3] &= tmp;
+    case 7: buf1[3] &= tmp;
             break;
   }
 }

-inline void truncate_left (u32 w0[4], u32 w1[4], const u32 len)
+inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset)
 {
-  const u32 tmp = ~((1 << ((len % 4) * 8)) - 1);
+  const u32 tmp = ~((1u << ((offset & 3u) * 8u)) - 1u);

-  switch (len / 4)
+  switch (offset / 4)
   {
-    case 0: w0[0] &= tmp;
+    case 0: buf0[0] &= tmp;
             break;
-    case 1: w0[0] = 0;
-            w0[1] &= tmp;
+    case 1: buf0[0] = 0;
+            buf0[1] &= tmp;
             break;
-    case 2: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] &= tmp;
+    case 2: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] &= tmp;
             break;
-    case 3: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] &= tmp;
+    case 3: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] &= tmp;
             break;
-    case 4: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] &= tmp;
+    case 4: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] &= tmp;
             break;
-    case 5: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] &= tmp;
+    case 5: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] &= tmp;
             break;
-    case 6: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] &= tmp;
+    case 6: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] &= tmp;
             break;
-    case 7: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] &= tmp;
+    case 7: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] &= tmp;
             break;
   }
 }
@@ -738,23 +734,27 @@ inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32
   }
 }

-inline void append_block1 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_r0)
+inline void append_block1 (const u32 offset, u32 buf0[4], u32 dst1[4], const u32 src_r0)
 {
   // this version works with 1 byte append only

-  const u32 tmp = (src_r0 & 0xff) << ((offset & 3) * 8);
+  const u32 value = src_r0 & 0xff;

-  dst0[0] |= (offset < 4) ? tmp : 0;
-  dst0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0;
-  dst0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0;
-  dst0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0;
+  const u32 shift = (offset & 3) * 8;
+
+  const u32 tmp = value << shift;
+
+  buf0[0] |= (offset < 4) ? tmp : 0;
+  buf0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0;
+  buf0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0;
+  buf0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0;
   dst1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0;
   dst1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0;
   dst1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0;
   dst1[3] |= (offset >= 28) ? tmp : 0;
 }

-inline void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])
+inline void append_block8 (const u32 offset, u32 buf0[4], u32 dst1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])
 {
   switch (offset)
   {
@@ -835,136 +835,136 @@ inline void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
       dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
       dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
-      dst0[3] = src_l0[3] | src_r0[0] << 24;
+      buf0[3] = src_l0[3] | src_r0[0] << 24;
       break;

     case 14:
       dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
       dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
       dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
       dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
-      dst0[3] = src_l0[3] | src_r0[0] << 16;
+      buf0[3] = src_l0[3] | src_r0[0] << 16;
       break;

     case 13:
       dst1[3] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
       dst1[2] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
       dst1[1] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
       dst1[0] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
-      dst0[3] = src_l0[3] | src_r0[0] << 8;
+      buf0[3] = src_l0[3] | src_r0[0] << 8;
       break;

     case 12:
       dst1[3] = src_r1[0];
       dst1[2] = src_r0[3];
       dst1[1] = src_r0[2];
       dst1[0] = src_r0[1];
-      dst0[3] = src_r0[0];
+      buf0[3] = src_r0[0];
       break;

     case 11:
       dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
       dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
       dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
       dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
-      dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
-      dst0[2] = src_l0[2] | src_r0[0] << 24;
+      buf0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
+      buf0[2] = src_l0[2] | src_r0[0] << 24;
       break;

     case 10:
       dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
       dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
       dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
       dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
-      dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
-      dst0[2] = src_l0[2] | src_r0[0] << 16;
+      buf0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
+      buf0[2] = src_l0[2] | src_r0[0] << 16;
       break;

     case 9:
       dst1[3] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
       dst1[2] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
       dst1[1] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
       dst1[0] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
-      dst0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
-      dst0[2] = src_l0[2] | src_r0[0] << 8;
+      buf0[3] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
+      buf0[2] = src_l0[2] | src_r0[0] << 8;
       break;

     case 8:
       dst1[3] = src_r1[1];
       dst1[2] = src_r1[0];
       dst1[1] = src_r0[3];
       dst1[0] = src_r0[2];
-      dst0[3] = src_r0[1];
-      dst0[2] = src_r0[0];
+      buf0[3] = src_r0[1];
+      buf0[2] = src_r0[0];
       break;

     case 7:
       dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 1);
       dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
       dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
       dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
-      dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
-      dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
-      dst0[1] = src_l0[1] | src_r0[0] << 24;
+      buf0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
+      buf0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
+      buf0[1] = src_l0[1] | src_r0[0] << 24;
       break;

     case 6:
       dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 2);
       dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
       dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
       dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
-      dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
-      dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
-      dst0[1] = src_l0[1] | src_r0[0] << 16;
+      buf0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
+      buf0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
+      buf0[1] = src_l0[1] | src_r0[0] << 16;
       break;

     case 5:
       dst1[3] = amd_bytealign_S (src_r1[2], src_r1[1], 3);
       dst1[2] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
       dst1[1] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
       dst1[0] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
-      dst0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
-      dst0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
-      dst0[1] = src_l0[1] | src_r0[0] << 8;
+      buf0[3] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
+      buf0[2] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
+      buf0[1] = src_l0[1] | src_r0[0] << 8;
       break;

     case 4:
       dst1[3] = src_r1[2];
       dst1[2] = src_r1[1];
       dst1[1] = src_r1[0];
       dst1[0] = src_r0[3];
-      dst0[3] = src_r0[2];
-      dst0[2] = src_r0[1];
-      dst0[1] = src_r0[0];
+      buf0[3] = src_r0[2];
+      buf0[2] = src_r0[1];
+      buf0[1] = src_r0[0];
       break;

     case 3:
       dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 1);
       dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 1);
       dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 1);
       dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 1);
-      dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
-      dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
-      dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
-      dst0[0] = src_l0[0] | src_r0[0] << 24;
+      buf0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 1);
+      buf0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 1);
+      buf0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 1);
+      buf0[0] = src_l0[0] | src_r0[0] << 24;
       break;

     case 2:
       dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 2);
       dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 2);
       dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 2);
       dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 2);
-      dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
-      dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
-      dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
-      dst0[0] = src_l0[0] | src_r0[0] << 16;
+      buf0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 2);
+      buf0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 2);
+      buf0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 2);
+      buf0[0] = src_l0[0] | src_r0[0] << 16;
       break;

     case 1:
       dst1[3] = amd_bytealign_S (src_r1[3], src_r1[2], 3);
       dst1[2] = amd_bytealign_S (src_r1[2], src_r1[1], 3);
       dst1[1] = amd_bytealign_S (src_r1[1], src_r1[0], 3);
       dst1[0] = amd_bytealign_S (src_r1[0], src_r0[3], 3);
-      dst0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
-      dst0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
-      dst0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
-      dst0[0] = src_l0[0] | src_r0[0] << 8;
+      buf0[3] = amd_bytealign_S (src_r0[3], src_r0[2], 3);
+      buf0[2] = amd_bytealign_S (src_r0[2], src_r0[1], 3);
+      buf0[1] = amd_bytealign_S (src_r0[1], src_r0[0], 3);
+      buf0[0] = src_l0[0] | src_r0[0] << 8;
       break;

     case 0:
       dst1[3] = src_r1[3];
       dst1[2] = src_r1[2];
       dst1[1] = src_r1[1];
       dst1[0] = src_r1[0];
-      dst0[3] = src_r0[3];
-      dst0[2] = src_r0[2];
-      dst0[1] = src_r0[1];
-      dst0[0] = src_r0[0];
+      buf0[3] = src_r0[3];
+      buf0[2] = src_r0[2];
+      buf0[1] = src_r0[1];
+      buf0[0] = src_r0[0];
       break;
   }
 }
@@ -2531,20 +2531,20 @@ inline u32 apply_rules (const __global u32 *cmds, u32 buf0[4], u32 buf1[4], cons
   return out_len;
 }

-inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x w0[4], u32x w1[4])
+inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, const __global kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4])
 {
 #if VECT_SIZE == 1
-  w0[0] = pw_buf0[0];
-  w0[1] = pw_buf0[1];
-  w0[2] = pw_buf0[2];
-  w0[3] = pw_buf0[3];
-  w1[0] = pw_buf1[0];
-  w1[1] = pw_buf1[1];
-  w1[2] = pw_buf1[2];
-  w1[3] = pw_buf1[3];
+  buf0[0] = pw_buf0[0];
+  buf0[1] = pw_buf0[1];
+  buf0[2] = pw_buf0[2];
+  buf0[3] = pw_buf0[3];
+  buf1[0] = pw_buf1[0];
+  buf1[1] = pw_buf1[1];
+  buf1[2] = pw_buf1[2];
+  buf1[3] = pw_buf1[3];

-  return apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len);
+  return apply_rules (rules_buf[il_pos].cmds, buf0, buf1, pw_len);

 #else

@@ -2573,200 +2573,200 @@ inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const
   {
 #if VECT_SIZE >= 2
     case 0:
-      w0[0].s0 = tmp0[0];
-      w0[1].s0 = tmp0[1];
-      w0[2].s0 = tmp0[2];
-      w0[3].s0 = tmp0[3];
-      w1[0].s0 = tmp1[0];
-      w1[1].s0 = tmp1[1];
-      w1[2].s0 = tmp1[2];
-      w1[3].s0 = tmp1[3];
+      buf0[0].s0 = tmp0[0];
+      buf0[1].s0 = tmp0[1];
+      buf0[2].s0 = tmp0[2];
+      buf0[3].s0 = tmp0[3];
+      buf1[0].s0 = tmp1[0];
+      buf1[1].s0 = tmp1[1];
+      buf1[2].s0 = tmp1[2];
+      buf1[3].s0 = tmp1[3];
       out_len.s0 = tmp_len;
       break;

     case 1:
-      w0[0].s1 = tmp0[0];
-      w0[1].s1 = tmp0[1];
-      w0[2].s1 = tmp0[2];
-      w0[3].s1 = tmp0[3];
-      w1[0].s1 = tmp1[0];
-      w1[1].s1 = tmp1[1];
-      w1[2].s1 = tmp1[2];
-      w1[3].s1 = tmp1[3];
+      buf0[0].s1 = tmp0[0];
+      buf0[1].s1 = tmp0[1];
+      buf0[2].s1 = tmp0[2];
+      buf0[3].s1 = tmp0[3];
+      buf1[0].s1 = tmp1[0];
+      buf1[1].s1 = tmp1[1];
+      buf1[2].s1 = tmp1[2];
+      buf1[3].s1 = tmp1[3];
       out_len.s1 = tmp_len;
       break;
 #endif

 #if VECT_SIZE >= 4
     case 2:
-      w0[0].s2 = tmp0[0];
-      w0[1].s2 = tmp0[1];
-      w0[2].s2 = tmp0[2];
-      w0[3].s2 = tmp0[3];
-      w1[0].s2 = tmp1[0];
-      w1[1].s2 = tmp1[1];
-      w1[2].s2 = tmp1[2];
-      w1[3].s2 = tmp1[3];
+      buf0[0].s2 = tmp0[0];
+      buf0[1].s2 = tmp0[1];
+      buf0[2].s2 = tmp0[2];
+      buf0[3].s2 = tmp0[3];
+      buf1[0].s2 = tmp1[0];
+      buf1[1].s2 = tmp1[1];
+      buf1[2].s2 = tmp1[2];
+      buf1[3].s2 = tmp1[3];
       out_len.s2 = tmp_len;
       break;

     case 3:
-      w0[0].s3 = tmp0[0];
-      w0[1].s3 = tmp0[1];
-      w0[2].s3 = tmp0[2];
-      w0[3].s3 = tmp0[3];
-      w1[0].s3 = tmp1[0];
-      w1[1].s3 = tmp1[1];
-      w1[2].s3 = tmp1[2];
-      w1[3].s3 = tmp1[3];
+      buf0[0].s3 = tmp0[0];
+      buf0[1].s3 = tmp0[1];
+      buf0[2].s3 = tmp0[2];
+      buf0[3].s3 = tmp0[3];
+      buf1[0].s3 = tmp1[0];
+      buf1[1].s3 = tmp1[1];
+      buf1[2].s3 = tmp1[2];
+      buf1[3].s3 = tmp1[3];
       out_len.s3 = tmp_len;
       break;
 #endif

 #if VECT_SIZE >= 8
     case 4:
-      w0[0].s4 = tmp0[0];
-      w0[1].s4 = tmp0[1];
-      w0[2].s4 = tmp0[2];
-      w0[3].s4 = tmp0[3];
-      w1[0].s4 = tmp1[0];
-      w1[1].s4 = tmp1[1];
-      w1[2].s4 = tmp1[2];
-      w1[3].s4 = tmp1[3];
+      buf0[0].s4 = tmp0[0];
+      buf0[1].s4 = tmp0[1];
+      buf0[2].s4 = tmp0[2];
+      buf0[3].s4 = tmp0[3];
+      buf1[0].s4 = tmp1[0];
+      buf1[1].s4 = tmp1[1];
+      buf1[2].s4 = tmp1[2];
+      buf1[3].s4 = tmp1[3];
       out_len.s4 = tmp_len;
       break;

     case 5:
-      w0[0].s5 = tmp0[0];
-      w0[1].s5 = tmp0[1];
-      w0[2].s5 = tmp0[2];
-      w0[3].s5 = tmp0[3];
-      w1[0].s5 = tmp1[0];
-      w1[1].s5 = tmp1[1];
-      w1[2].s5 = tmp1[2];
-      w1[3].s5 = tmp1[3];
+      buf0[0].s5 = tmp0[0];
+      buf0[1].s5 = tmp0[1];
+      buf0[2].s5 = tmp0[2];
+      buf0[3].s5 = tmp0[3];
+      buf1[0].s5 = tmp1[0];
+      buf1[1].s5 = tmp1[1];
+      buf1[2].s5 = tmp1[2];
+      buf1[3].s5 = tmp1[3];
       out_len.s5 = tmp_len;
       break;

     case 6:
-      w0[0].s6 = tmp0[0];
-      w0[1].s6 = tmp0[1];
-      w0[2].s6 = tmp0[2];
-      w0[3].s6 = tmp0[3];
-      w1[0].s6 = tmp1[0];
-      w1[1].s6 = tmp1[1];
-      w1[2].s6 = tmp1[2];
-      w1[3].s6 = tmp1[3];
+      buf0[0].s6 = tmp0[0];
+      buf0[1].s6 = tmp0[1];
+      buf0[2].s6 = tmp0[2];
+      buf0[3].s6 = tmp0[3];
+      buf1[0].s6 = tmp1[0];
+      buf1[1].s6 = tmp1[1];
+      buf1[2].s6 = tmp1[2];
+      buf1[3].s6 = tmp1[3];
       out_len.s6 = tmp_len;
       break;

     case 7:
-      w0[0].s7 = tmp0[0];
-      w0[1].s7 = tmp0[1];
-      w0[2].s7 = tmp0[2];
-      w0[3].s7 = tmp0[3];
-      w1[0].s7 = tmp1[0];
-      w1[1].s7 = tmp1[1];
-      w1[2].s7 = tmp1[2];
-      w1[3].s7 = tmp1[3];
+      buf0[0].s7 = tmp0[0];
+      buf0[1].s7 = tmp0[1];
+      buf0[2].s7 = tmp0[2];
+      buf0[3].s7 = tmp0[3];
+      buf1[0].s7 = tmp1[0];
+      buf1[1].s7 = tmp1[1];
+      buf1[2].s7 = tmp1[2];
+      buf1[3].s7 = tmp1[3];
       out_len.s7 = tmp_len;
       break;
 #endif

 #if VECT_SIZE >= 16
     case 8:
-      w0[0].s8 = tmp0[0];
-      w0[1].s8 = tmp0[1];
-      w0[2].s8 = tmp0[2];
-      w0[3].s8 = tmp0[3];
-      w1[0].s8 = tmp1[0];
-      w1[1].s8 = tmp1[1];
-      w1[2].s8 = tmp1[2];
-      w1[3].s8 = tmp1[3];
+      buf0[0].s8 = tmp0[0];
+      buf0[1].s8 = tmp0[1];
+      buf0[2].s8 = tmp0[2];
+      buf0[3].s8 = tmp0[3];
+      buf1[0].s8 = tmp1[0];
+      buf1[1].s8 = tmp1[1];
+      buf1[2].s8 = tmp1[2];
+      buf1[3].s8 = tmp1[3];
       out_len.s8 = tmp_len;
       break;

     case 9:
-      w0[0].s9 = tmp0[0];
-      w0[1].s9 = tmp0[1];
-      w0[2].s9 = tmp0[2];
-      w0[3].s9 = tmp0[3];
-      w1[0].s9 = tmp1[0];
-      w1[1].s9 = tmp1[1];
-      w1[2].s9 = tmp1[2];
-      w1[3].s9 = tmp1[3];
+      buf0[0].s9 = tmp0[0];
+      buf0[1].s9 = tmp0[1];
+      buf0[2].s9 = tmp0[2];
+      buf0[3].s9 = tmp0[3];
+      buf1[0].s9 = tmp1[0];
+      buf1[1].s9 = tmp1[1];
+      buf1[2].s9 = tmp1[2];
+      buf1[3].s9 = tmp1[3];
       out_len.s9 = tmp_len;
       break;

     case 10:
-      w0[0].sa = tmp0[0];
-      w0[1].sa = tmp0[1];
-      w0[2].sa = tmp0[2];
-      w0[3].sa = tmp0[3];
-      w1[0].sa = tmp1[0];
-      w1[1].sa = tmp1[1];
-      w1[2].sa = tmp1[2];
-      w1[3].sa = tmp1[3];
+      buf0[0].sa = tmp0[0];
+      buf0[1].sa = tmp0[1];
+      buf0[2].sa = tmp0[2];
+      buf0[3].sa = tmp0[3];
+      buf1[0].sa = tmp1[0];
+      buf1[1].sa = tmp1[1];
+      buf1[2].sa = tmp1[2];
+      buf1[3].sa = tmp1[3];
       out_len.sa = tmp_len;
       break;

     case 11:
-      w0[0].sb = tmp0[0];
-      w0[1].sb = tmp0[1];
-      w0[2].sb = tmp0[2];
-      w0[3].sb = tmp0[3];
-      w1[0].sb = tmp1[0];
-      w1[1].sb = tmp1[1];
-      w1[2].sb = tmp1[2];
-      w1[3].sb = tmp1[3];
+      buf0[0].sb = tmp0[0];
+      buf0[1].sb = tmp0[1];
+      buf0[2].sb = tmp0[2];
+      buf0[3].sb = tmp0[3];
+      buf1[0].sb = tmp1[0];
+      buf1[1].sb = tmp1[1];
+      buf1[2].sb = tmp1[2];
+      buf1[3].sb = tmp1[3];
       out_len.sb = tmp_len;
       break;

     case 12:
-      w0[0].sc = tmp0[0];
-      w0[1].sc = tmp0[1];
-      w0[2].sc = tmp0[2];
-      w0[3].sc = tmp0[3];
-      w1[0].sc = tmp1[0];
-      w1[1].sc = tmp1[1];
-      w1[2].sc = tmp1[2];
-      w1[3].sc = tmp1[3];
+      buf0[0].sc = tmp0[0];
+      buf0[1].sc = tmp0[1];
+      buf0[2].sc = tmp0[2];
+      buf0[3].sc = tmp0[3];
+      buf1[0].sc = tmp1[0];
+      buf1[1].sc = tmp1[1];
+      buf1[2].sc = tmp1[2];
+      buf1[3].sc = tmp1[3];
       out_len.sc = tmp_len;
       break;

     case 13:
-      w0[0].sd = tmp0[0];
-      w0[1].sd = tmp0[1];
-      w0[2].sd = tmp0[2];
-      w0[3].sd = tmp0[3];
-      w1[0].sd = tmp1[0];
-      w1[1].sd = tmp1[1];
-      w1[2].sd = tmp1[2];
-      w1[3].sd = tmp1[3];
+      buf0[0].sd = tmp0[0];
+      buf0[1].sd = tmp0[1];
+      buf0[2].sd = tmp0[2];
+      buf0[3].sd = tmp0[3];
+      buf1[0].sd = tmp1[0];
+      buf1[1].sd = tmp1[1];
+      buf1[2].sd = tmp1[2];
+      buf1[3].sd = tmp1[3];
       out_len.sd = tmp_len;
       break;

     case 14:
-      w0[0].se = tmp0[0];
-      w0[1].se = tmp0[1];
-      w0[2].se = tmp0[2];
-      w0[3].se = tmp0[3];
-      w1[0].se = tmp1[0];
-      w1[1].se = tmp1[1];
-      w1[2].se = tmp1[2];
-      w1[3].se = tmp1[3];
+      buf0[0].se = tmp0[0];
+      buf0[1].se = tmp0[1];
+      buf0[2].se = tmp0[2];
+      buf0[3].se = tmp0[3];
+      buf1[0].se = tmp1[0];
+      buf1[1].se = tmp1[1];
+      buf1[2].se = tmp1[2];
+      buf1[3].se = tmp1[3];
       out_len.se = tmp_len;
       break;

     case 15:
-      w0[0].sf = tmp0[0];
-      w0[1].sf = tmp0[1];
-      w0[2].sf = tmp0[2];
-      w0[3].sf = tmp0[3];
-      w1[0].sf = tmp1[0];
-      w1[1].sf = tmp1[1];
-      w1[2].sf = tmp1[2];
-      w1[3].sf = tmp1[3];
+      buf0[0].sf = tmp0[0];
+      buf0[1].sf = tmp0[1];
+      buf0[2].sf = tmp0[2];
+      buf0[3].sf = tmp0[3];
+      buf1[0].sf = tmp1[0];
+      buf1[1].sf = tmp1[1];
+      buf1[2].sf = tmp1[2];
+      buf1[3].sf = tmp1[3];
       out_len.sf = tmp_len;
       break;
 #endif
diff --git a/include/rp_kernel_on_cpu.h b/include/rp_kernel_on_cpu.h
index 90f29b660..8bf340726 100644
--- a/include/rp_kernel_on_cpu.h
+++ b/include/rp_kernel_on_cpu.h
@@ -6,8 +6,6 @@
 #ifndef _RP_KERNEL_ON_CPU_H
 #define _RP_KERNEL_ON_CPU_H

-u32 swap_workaround (const u32 n);
-
 u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len);
 u32 apply_rules (u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len);

diff --git a/src/rp_kernel_on_cpu.c b/src/rp_kernel_on_cpu.c
index fdfbb841e..8bea7b6e0 100644
--- a/src/rp_kernel_on_cpu.c
+++ b/src/rp_kernel_on_cpu.c
@@ -9,124 +9,124 @@
 #include "rp.h"
 #include "rp_kernel_on_cpu.h"

-u32 swap_workaround (const u32 n)
+static u32 swap32_S (const u32 value)
 {
-  return byte_swap_32 (n);
+  return byte_swap_32 (value);
 }

-static u32 generate_cmask (u32 buf)
+static u32 generate_cmask (const u32 value)
 {
-  const u32 rmask = ((buf & 0x40404040) >> 1)
-                  & ~((buf & 0x80808080) >> 2);
+  const u32 rmask = ((value & 0x40404040u) >> 1u)
+                  & ~((value & 0x80808080u) >> 2u);

-  const u32 hmask = (buf & 0x1f1f1f1f) + 0x05050505;
-  const u32 lmask = (buf & 0x1f1f1f1f) + 0x1f1f1f1f;
+  const u32 hmask = (value & 0x1f1f1f1fu) + 0x05050505u;
+  const u32 lmask = (value & 0x1f1f1f1fu) + 0x1f1f1f1fu;

   return rmask & ~hmask & lmask;
 }

-static void truncate_right (u32 w0[4], u32 w1[4], const u32 len)
+static void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset)
 {
-  const u32 tmp = (1u << ((len % 4) * 8)) - 1;
+  const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u;

-  switch (len / 4)
+  switch (offset / 4)
   {
-    case 0: w0[0] &= tmp;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 0: buf0[0] &= tmp;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 1: w0[1] &= tmp;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 1: buf0[1] &= tmp;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 2: w0[2] &= tmp;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 2: buf0[2] &= tmp;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 3: w0[3] &= tmp;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 3: buf0[3] &= tmp;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 4: w1[0] &= tmp;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 4: buf1[0] &= tmp;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 5: w1[1] &= tmp;
-            w1[2] = 0;
-            w1[3] = 0;
+    case 5: buf1[1] &= tmp;
+            buf1[2] = 0;
+            buf1[3] = 0;
             break;
-    case 6: w1[2] &= tmp;
-            w1[3] = 0;
+    case 6: buf1[2] &= tmp;
+            buf1[3] = 0;
             break;
-    case 7: w1[3] &= tmp;
+    case 7: buf1[3] &= tmp;
             break;
   }
 }

-static void truncate_left (u32 w0[4], u32 w1[4], const u32 len)
+static void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset)
 {
-  const u32 tmp = ~((1u << ((len % 4) * 8)) - 1);
+  const u32 tmp = ~((1u << ((offset & 3u) * 8u)) - 1u);

-  switch (len / 4)
+  switch (offset / 4)
   {
-    case 0: w0[0] &= tmp;
+    case 0: buf0[0] &= tmp;
             break;
-    case 1: w0[0] = 0;
-            w0[1] &= tmp;
+    case 1: buf0[0] = 0;
+            buf0[1] &= tmp;
             break;
-    case 2: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] &= tmp;
+    case 2: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] &= tmp;
             break;
-    case 3: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] &= tmp;
+    case 3: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] &= tmp;
             break;
-    case 4: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] &= tmp;
+    case 4: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] &= tmp;
             break;
-    case 5: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] &= tmp;
+    case 5: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] &= tmp;
             break;
-    case 6: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] &= tmp;
+    case 6: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] &= tmp;
             break;
-    case 7: w0[0] = 0;
-            w0[1] = 0;
-            w0[2] = 0;
-            w0[3] = 0;
-            w1[0] = 0;
-            w1[1] = 0;
-            w1[2] = 0;
-            w1[3] &= tmp;
+    case 7: buf0[0] = 0;
+            buf0[1] = 0;
+            buf0[2] = 0;
+            buf0[3] = 0;
+            buf1[0] = 0;
+            buf1[1] = 0;
+            buf1[2] = 0;
+            buf1[3] &= tmp;
             break;
   }
 }
@@ -746,31 +746,35 @@ static void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32
   }
 }

-static void append_block1 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_r0)
+static void append_block1 (const u32 offset, u32 buf0[4], u32 dst1[4], const u32 src_r0)
 {
   // this version works with 1 byte append only

-  const u32 tmp = (src_r0 & 0xff) << ((offset & 3) * 8);
+  const u32 value = src_r0 & 0xff;

-  dst0[0] |= (offset < 4) ? tmp : 0;
-  dst0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0;
-  dst0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0;
-  dst0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0;
+  const u32 shift = (offset & 3) * 8;
+
+  const u32 tmp = value << shift;
+
+  buf0[0] |= (offset < 4) ? tmp : 0;
+  buf0[1] |= ((offset >= 4) && (offset < 8)) ? tmp : 0;
+  buf0[2] |= ((offset >= 8) && (offset < 12)) ? tmp : 0;
+  buf0[3] |= ((offset >= 12) && (offset < 16)) ? tmp : 0;
   dst1[0] |= ((offset >= 16) && (offset < 20)) ? tmp : 0;
   dst1[1] |= ((offset >= 20) && (offset < 24)) ? tmp : 0;
   dst1[2] |= ((offset >= 24) && (offset < 28)) ? tmp : 0;
   dst1[3] |= (offset >= 28) ? tmp : 0;
 }

-static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])
+static void append_block8 (const u32 offset, u32 buf0[4], u32 dst1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])
 {
   switch (offset)
   {
     case 0:
-      dst0[0] = src_r0[0];
-      dst0[1] = src_r0[1];
-      dst0[2] = src_r0[2];
-      dst0[3] = src_r0[3];
+      buf0[0] = src_r0[0];
+      buf0[1] = src_r0[1];
+      buf0[2] = src_r0[2];
+      buf0[3] = src_r0[3];
       dst1[0] = src_r1[0];
       dst1[1] = src_r1[1];
       dst1[2] = src_r1[2];
@@ -778,10 +782,10 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 1:
-      dst0[0] = src_l0[0] | src_r0[0] << 8;
-      dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
-      dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
-      dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
+      buf0[0] = src_l0[0] | src_r0[0] << 8;
+      buf0[1] = src_r0[0] >> 24 | src_r0[1] << 8;
+      buf0[2] = src_r0[1] >> 24 | src_r0[2] << 8;
+      buf0[3] = src_r0[2] >> 24 | src_r0[3] << 8;
       dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8;
       dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8;
       dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8;
@@ -789,10 +793,10 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 2:
-      dst0[0] = src_l0[0] | src_r0[0] << 16;
-      dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
-      dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
-      dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
+      buf0[0] = src_l0[0] | src_r0[0] << 16;
+      buf0[1] = src_r0[0] >> 16 | src_r0[1] << 16;
+      buf0[2] = src_r0[1] >> 16 | src_r0[2] << 16;
+      buf0[3] = src_r0[2] >> 16 | src_r0[3] << 16;
       dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16;
       dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16;
       dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16;
@@ -800,10 +804,10 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 3:
-      dst0[0] = src_l0[0] | src_r0[0] << 24;
-      dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
-      dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
-      dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
+      buf0[0] = src_l0[0] | src_r0[0] << 24;
+      buf0[1] = src_r0[0] >> 8 | src_r0[1] << 24;
+      buf0[2] = src_r0[1] >> 8 | src_r0[2] << 24;
+      buf0[3] = src_r0[2] >> 8 | src_r0[3] << 24;
       dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24;
       dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24;
       dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24;
@@ -811,9 +815,9 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 4:
-      dst0[1] = src_r0[0];
-      dst0[2] = src_r0[1];
-      dst0[3] = src_r0[2];
+      buf0[1] = src_r0[0];
+      buf0[2] = src_r0[1];
+      buf0[3] = src_r0[2];
       dst1[0] = src_r0[3];
       dst1[1] = src_r1[0];
       dst1[2] = src_r1[1];
@@ -821,9 +825,9 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 5:
-      dst0[1] = src_l0[1] | src_r0[0] << 8;
-      dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
-      dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
+      buf0[1] = src_l0[1] | src_r0[0] << 8;
+      buf0[2] = src_r0[0] >> 24 | src_r0[1] << 8;
+      buf0[3] = src_r0[1] >> 24 | src_r0[2] << 8;
       dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8;
       dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8;
       dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8;
@@ -831,9 +835,9 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 6:
-      dst0[1] = src_l0[1] | src_r0[0] << 16;
-      dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
-      dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
+      buf0[1] = src_l0[1] | src_r0[0] << 16;
+      buf0[2] = src_r0[0] >> 16 | src_r0[1] << 16;
+      buf0[3] = src_r0[1] >> 16 | src_r0[2] << 16;
       dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16;
       dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16;
       dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16;
@@ -841,9 +845,9 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 7:
-      dst0[1] = src_l0[1] | src_r0[0] << 24;
-      dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
-      dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
+      buf0[1] = src_l0[1] | src_r0[0] << 24;
+      buf0[2] = src_r0[0] >> 8 | src_r0[1] << 24;
+      buf0[3] = src_r0[1] >> 8 | src_r0[2] << 24;
       dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24;
       dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24;
       dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24;
@@ -851,8 +855,8 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 8:
-      dst0[2] = src_r0[0];
-      dst0[3] = src_r0[1];
+      buf0[2] = src_r0[0];
+      buf0[3] = src_r0[1];
       dst1[0] = src_r0[2];
       dst1[1] = src_r0[3];
       dst1[2] = src_r1[0];
@@ -860,8 +864,8 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 9:
-      dst0[2] = src_l0[2] | src_r0[0] << 8;
-      dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
+      buf0[2] = src_l0[2] | src_r0[0] << 8;
+      buf0[3] = src_r0[0] >> 24 | src_r0[1] << 8;
       dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8;
       dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8;
       dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8;
@@ -869,8 +873,8 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 10:
-      dst0[2] = src_l0[2] | src_r0[0] << 16;
-      dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
+      buf0[2] = src_l0[2] | src_r0[0] << 16;
+      buf0[3] = src_r0[0] >> 16 | src_r0[1] << 16;
       dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16;
       dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16;
       dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16;
@@ -878,8 +882,8 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 11:
-      dst0[2] = src_l0[2] | src_r0[0] << 24;
-      dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
+      buf0[2] = src_l0[2] | src_r0[0] << 24;
+      buf0[3] = src_r0[0] >> 8 | src_r0[1] << 24;
       dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24;
       dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24;
       dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24;
@@ -887,7 +891,7 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 12:
-      dst0[3] = src_r0[0];
+      buf0[3] = src_r0[0];
       dst1[0] = src_r0[1];
       dst1[1] = src_r0[2];
       dst1[2] = src_r0[3];
@@ -895,7 +899,7 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 13:
-      dst0[3] = src_l0[3] | src_r0[0] << 8;
+      buf0[3] = src_l0[3] | src_r0[0] << 8;
       dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8;
       dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8;
       dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8;
@@ -903,7 +907,7 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 14:
-      dst0[3] = src_l0[3] | src_r0[0] << 16;
+      buf0[3] = src_l0[3] | src_r0[0] << 16;
       dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16;
       dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16;
       dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16;
@@ -911,7 +915,7 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
       break;

     case 15:
-      dst0[3] = src_l0[3] | src_r0[0] << 24;
+      buf0[3] = src_l0[3] | src_r0[0] << 24;
       dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24;
       dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24;
       dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24;
@@ -1024,14 +1028,14 @@ static void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], con
   tib41[2] = out0[1];
   tib41[3] = out0[0];

-  out0[0] = swap_workaround (tib40[0]);
-  out0[1] = swap_workaround (tib40[1]);
-  out0[2] = swap_workaround (tib40[2]);
-  out0[3] = swap_workaround (tib40[3]);
-  out1[0] = swap_workaround (tib41[0]);
-  out1[1] = swap_workaround (tib41[1]);
-  out1[2] = swap_workaround (tib41[2]);
-  out1[3] = swap_workaround (tib41[3]);
+  out0[0] = swap32_S (tib40[0]);
+  out0[1] = swap32_S (tib40[1]);
+  out0[2] = swap32_S (tib40[2]);
+  out0[3] = swap32_S (tib40[3]);
+  out1[0] = swap32_S (tib41[0]);
+  out1[1] = swap32_S (tib41[1]);
+  out1[2] = swap32_S (tib41[2]);
+  out1[3] = swap32_S (tib41[3]);
 }

 static u32 rule_op_mangle_lrest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 buf0[4], MAYBE_UNUSED u32 buf1[4], const u32 in_len)
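
Editorial note (not part of the patch): generate_cmask is the per-byte trick the rule engine's case-mangling rules build on, and its behavior is easy to sanity-check on the host. For every packed byte that is an ASCII letter it yields 0x20 in that byte lane and 0x00 otherwise, so XOR-ing a word with its own cmask toggles the case of all letters while leaving digits, symbols, and high-bit bytes untouched. A minimal standalone sketch in plain C; the test scaffolding below is illustrative, not hashcat code:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint32_t u32;

static u32 generate_cmask (const u32 value)
{
  const u32 rmask = ((value & 0x40404040u) >> 1u)    // 0x20 where bit 6 is set
                  & ~((value & 0x80808080u) >> 2u);  // ...and bit 7 is clear

  const u32 hmask = (value & 0x1f1f1f1fu) + 0x05050505u; // carry into 0x20 if low 5 bits > 26
  const u32 lmask = (value & 0x1f1f1f1fu) + 0x1f1f1f1fu; // carry into 0x20 if low 5 bits >= 1

  return rmask & ~hmask & lmask; // 0x20 exactly for bytes in A-Z / a-z
}

int main (void)
{
  char word[4] = { 'P', 'a', 's', '1' }; // hypothetical test input

  u32 w;

  memcpy (&w, word, 4);

  const u32 toggled = w ^ generate_cmask (w); // toggle-case semantics

  memcpy (word, &toggled, 4);

  printf ("%.4s\n", word); // prints pAS1: letters flipped, digit untouched

  return 0;
}

Because the mask is computed lane-wise, the kernels can case-mangle four characters per u32 with no per-byte branching.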
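A second sketch, for the buffer layout the renamed helpers share: a candidate password is held as 32 packed little-endian bytes across the two u32 quads buf0 and buf1, and append_block1 ORs one byte into the lane selected by offset. The index-based stand-in below is a condensed equivalent of the patch's branch-free ternary chain, written for this note (it is not hashcat code) and valid for offsets 0..31 on a little-endian host:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint32_t u32;

// hypothetical stand-in with the same observable effect as append_block1
static void append_block1_sketch (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0)
{
  const u32 value = src_r0 & 0xff;        // only 1-byte appends are supported
  const u32 shift = (offset & 3) * 8;     // byte lane inside the u32 word

  u32 *buf = (offset < 16) ? buf0 : buf1; // first or second 16-byte half

  buf[(offset / 4) & 3] |= value << shift;
}

int main (void)
{
  u32 buf0[4] = { 0 };
  u32 buf1[4] = { 0 };

  memcpy (buf0, "password", 8);              // current length: 8 bytes

  append_block1_sketch (8, buf0, buf1, '!'); // like rule '$!' on "password"

  printf ("%.9s\n", (char *) buf0);          // prints: password!

  return 0;
}

The kernel versions keep the unrolled ternary chain rather than this computed index, presumably because dynamic indexing into a private array can force the buffers out of registers on GPU targets; the same trade-off explains the fully unrolled switch statements throughout the patch.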