From 1ed89eb261bf27c9dfbdb5b4a803fededb270878 Mon Sep 17 00:00:00 2001
From: jsteube
Date: Sun, 6 Nov 2016 12:25:38 +0100
Subject: [PATCH] Fix rule-engine optimizations for vector datatypes

---
 OpenCL/inc_rp.cl | 160 +++++++++++++++++++++++------------------------
 1 file changed, 80 insertions(+), 80 deletions(-)

diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl
index 407154221..b2c579b07 100644
--- a/OpenCL/inc_rp.cl
+++ b/OpenCL/inc_rp.cl
@@ -774,59 +774,59 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
   switch (offset / 4)
   {
     case 0:
-      s8 = amd_bytealign (        0, src_r1[3], offset_minus_4);
-      s7 = amd_bytealign (src_r1[3], src_r1[2], offset_minus_4);
-      s6 = amd_bytealign (src_r1[2], src_r1[1], offset_minus_4);
-      s5 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4);
-      s4 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
-      s3 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
-      s2 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
-      s1 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s0 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r1[3], offset_minus_4);
+      s7 = amd_bytealign_S (src_r1[3], src_r1[2], offset_minus_4);
+      s6 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4);
+      s5 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
+      s4 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
+      s3 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
+      s2 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
+      s1 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s0 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       break;

     case 1:
-      s8 = amd_bytealign (        0, src_r1[2], offset_minus_4);
-      s7 = amd_bytealign (src_r1[2], src_r1[1], offset_minus_4);
-      s6 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4);
-      s5 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
-      s4 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
-      s3 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
-      s2 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s1 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r1[2], offset_minus_4);
+      s7 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4);
+      s6 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
+      s5 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
+      s4 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
+      s3 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
+      s2 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s1 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s0 = 0;
       break;

     case 2:
-      s8 = amd_bytealign (        0, src_r1[1], offset_minus_4);
-      s7 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4);
-      s6 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
-      s5 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
-      s4 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
-      s3 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s2 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r1[1], offset_minus_4);
+      s7 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
+      s6 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
+      s5 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
+      s4 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
+      s3 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s2 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s1 = 0;
       s0 = 0;
       break;

     case 3:
-      s8 = amd_bytealign (        0, src_r1[0], offset_minus_4);
-      s7 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
-      s6 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
-      s5 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
-      s4 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s3 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r1[0], offset_minus_4);
+      s7 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
+      s6 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
+      s5 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
+      s4 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s3 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s2 = 0;
       s1 = 0;
       s0 = 0;
       break;

     case 4:
-      s8 = amd_bytealign (        0, src_r0[3], offset_minus_4);
-      s7 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
-      s6 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
-      s5 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s4 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r0[3], offset_minus_4);
+      s7 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
+      s6 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
+      s5 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s4 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s3 = 0;
       s2 = 0;
       s1 = 0;
@@ -834,10 +834,10 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 5:
-      s8 = amd_bytealign (        0, src_r0[2], offset_minus_4);
-      s7 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
-      s6 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s5 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r0[2], offset_minus_4);
+      s7 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
+      s6 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s5 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s4 = 0;
       s3 = 0;
       s2 = 0;
@@ -846,9 +846,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 6:
-      s8 = amd_bytealign (        0, src_r0[1], offset_minus_4);
-      s7 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
-      s6 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r0[1], offset_minus_4);
+      s7 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
+      s6 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s5 = 0;
       s4 = 0;
       s3 = 0;
@@ -858,8 +858,8 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 7:
-      s8 = amd_bytealign (        0, src_r0[0], offset_minus_4);
-      s7 = amd_bytealign (src_r0[0],         0, offset_minus_4);
+      s8 = amd_bytealign_S (        0, src_r0[0], offset_minus_4);
+      s7 = amd_bytealign_S (src_r0[0],         0, offset_minus_4);
       s6 = 0;
       s5 = 0;
       s4 = 0;
@@ -911,44 +911,44 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
   switch (offset / 4)
   {
     case 0:
-      s7 = __byte_perm (src_r1[2], src_r1[3], selector);
-      s6 = __byte_perm (src_r1[1], src_r1[2], selector);
-      s5 = __byte_perm (src_r1[0], src_r1[1], selector);
-      s4 = __byte_perm (src_r0[3], src_r1[0], selector);
-      s3 = __byte_perm (src_r0[2], src_r0[3], selector);
-      s2 = __byte_perm (src_r0[1], src_r0[2], selector);
-      s1 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s0 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r1[2], src_r1[3], selector);
+      s6 = __byte_perm_S (src_r1[1], src_r1[2], selector);
+      s5 = __byte_perm_S (src_r1[0], src_r1[1], selector);
+      s4 = __byte_perm_S (src_r0[3], src_r1[0], selector);
+      s3 = __byte_perm_S (src_r0[2], src_r0[3], selector);
+      s2 = __byte_perm_S (src_r0[1], src_r0[2], selector);
+      s1 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s0 = __byte_perm_S (        0, src_r0[0], selector);
       break;

     case 1:
-      s7 = __byte_perm (src_r1[1], src_r1[2], selector);
-      s6 = __byte_perm (src_r1[0], src_r1[1], selector);
-      s5 = __byte_perm (src_r0[3], src_r1[0], selector);
-      s4 = __byte_perm (src_r0[2], src_r0[3], selector);
-      s3 = __byte_perm (src_r0[1], src_r0[2], selector);
-      s2 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s1 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r1[1], src_r1[2], selector);
+      s6 = __byte_perm_S (src_r1[0], src_r1[1], selector);
+      s5 = __byte_perm_S (src_r0[3], src_r1[0], selector);
+      s4 = __byte_perm_S (src_r0[2], src_r0[3], selector);
+      s3 = __byte_perm_S (src_r0[1], src_r0[2], selector);
+      s2 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s1 = __byte_perm_S (        0, src_r0[0], selector);
       s0 = 0;
       break;

     case 2:
-      s7 = __byte_perm (src_r1[0], src_r1[1], selector);
-      s6 = __byte_perm (src_r0[3], src_r1[0], selector);
-      s5 = __byte_perm (src_r0[2], src_r0[3], selector);
-      s4 = __byte_perm (src_r0[1], src_r0[2], selector);
-      s3 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s2 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r1[0], src_r1[1], selector);
+      s6 = __byte_perm_S (src_r0[3], src_r1[0], selector);
+      s5 = __byte_perm_S (src_r0[2], src_r0[3], selector);
+      s4 = __byte_perm_S (src_r0[1], src_r0[2], selector);
+      s3 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s2 = __byte_perm_S (        0, src_r0[0], selector);
       s1 = 0;
       s0 = 0;
       break;

     case 3:
-      s7 = __byte_perm (src_r0[3], src_r1[0], selector);
-      s6 = __byte_perm (src_r0[2], src_r0[3], selector);
-      s5 = __byte_perm (src_r0[1], src_r0[2], selector);
-      s4 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s3 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r0[3], src_r1[0], selector);
+      s6 = __byte_perm_S (src_r0[2], src_r0[3], selector);
+      s5 = __byte_perm_S (src_r0[1], src_r0[2], selector);
+      s4 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s3 = __byte_perm_S (        0, src_r0[0], selector);
       s2 = 0;
       s1 = 0;
       s0 = 0;
@@ -956,10 +956,10 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 4:
-      s7 = __byte_perm (src_r0[2], src_r0[3], selector);
-      s6 = __byte_perm (src_r0[1], src_r0[2], selector);
-      s5 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s4 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r0[2], src_r0[3], selector);
+      s6 = __byte_perm_S (src_r0[1], src_r0[2], selector);
+      s5 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s4 = __byte_perm_S (        0, src_r0[0], selector);
       s3 = 0;
       s2 = 0;
       s1 = 0;
@@ -967,9 +967,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 5:
-      s7 = __byte_perm (src_r0[1], src_r0[2], selector);
-      s6 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s5 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r0[1], src_r0[2], selector);
+      s6 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s5 = __byte_perm_S (        0, src_r0[0], selector);
       s4 = 0;
       s3 = 0;
       s2 = 0;
@@ -978,8 +978,8 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 6:
-      s7 = __byte_perm (src_r0[0], src_r0[1], selector);
-      s6 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (src_r0[0], src_r0[1], selector);
+      s6 = __byte_perm_S (        0, src_r0[0], selector);
       s5 = 0;
       s4 = 0;
       s3 = 0;
@@ -989,7 +989,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
       break;

     case 7:
-      s7 = __byte_perm (        0, src_r0[0], selector);
+      s7 = __byte_perm_S (        0, src_r0[0], selector);
       s6 = 0;
       s5 = 0;
       s4 = 0;
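
For reviewers unfamiliar with the naming: the _S suffix marks the scalar (plain u32) variant of a helper, while the unsuffixed names are used on the vectorized u32x data path. append_block8 () works on scalar words, so once the surrounding datatypes were vectorized these call sites had to switch to the scalar forms; the byte shuffling itself is unchanged. Below is a minimal host-side C sketch of the semantics the two helpers are assumed to provide (AMD's amd_bytealign media-op and NVIDIA's __byte_perm / prmt byte permute). It is only an illustration, not the actual device definitions, and the constants in main () are made up for the example.

#include <stdio.h>
#include <stdint.h>

/* amd_bytealign semantics: shift the 64-bit concatenation hi:lo right by
   (shift & 3) bytes and keep the low 32 bits. In append_block8 this slides
   the appended words into place when offset is not a multiple of 4. */
static uint32_t bytealign_s (uint32_t hi, uint32_t lo, uint32_t shift)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) (v >> ((shift & 3) * 8));
}

/* __byte_perm semantics: build a 32-bit word from the 8-byte pool hi:lo,
   where each selector nibble (0..7) picks one source byte; lo supplies
   bytes 0..3 and hi supplies bytes 4..7. */
static uint32_t byte_perm_s (uint32_t lo, uint32_t hi, uint32_t sel)
{
  const uint64_t pool = ((uint64_t) hi << 32) | lo;

  uint32_t r = 0;

  for (int i = 0; i < 4; i++)
  {
    const uint32_t idx = (sel >> (i * 4)) & 7;

    r |= (uint32_t) ((pool >> (idx * 8)) & 0xff) << (i * 8);
  }

  return r;
}

int main (void)
{
  /* Append the word 0x44434241 ("ABCD") at byte offset 3: the first output
     word leaves its low three bytes untouched and receives only 'A' in its
     top byte, which is what the s0 assignments above produce for offset 3. */
  printf ("%08x\n", bytealign_s (0x44434241, 0, 4 - 3));  /* 41000000 */

  /* The same placement expressed as a byte permutation; 0x4321 is an
     illustrative selector, not necessarily the one the kernel computes. */
  printf ("%08x\n", byte_perm_s (0, 0x44434241, 0x4321)); /* 41000000 */

  return 0;
}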