Fix rule-engine optimizations for vector datatypes

pull/574/head
jsteube 8 years ago
parent 7a42c81123
commit 1ed89eb261

@@ -774,59 +774,59 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
switch (offset / 4)
{
case 0:
s8 = amd_bytealign ( 0, src_r1[3], offset_minus_4);
s7 = amd_bytealign (src_r1[3], src_r1[2], offset_minus_4);
s6 = amd_bytealign (src_r1[2], src_r1[1], offset_minus_4);
s5 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4);
s4 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
s3 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
s2 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
s1 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s0 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r1[3], offset_minus_4);
s7 = amd_bytealign_S (src_r1[3], src_r1[2], offset_minus_4);
s6 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4);
s5 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
s4 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s3 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s2 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s1 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s0 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
break;
case 1:
s8 = amd_bytealign ( 0, src_r1[2], offset_minus_4);
s7 = amd_bytealign (src_r1[2], src_r1[1], offset_minus_4);
s6 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4);
s5 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
s4 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
s3 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
s2 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s1 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r1[2], offset_minus_4);
s7 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4);
s6 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
s5 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s4 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s3 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s2 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s1 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s0 = 0;
break;
case 2:
s8 = amd_bytealign ( 0, src_r1[1], offset_minus_4);
s7 = amd_bytealign (src_r1[1], src_r1[0], offset_minus_4);
s6 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
s5 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
s4 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
s3 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s2 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r1[1], offset_minus_4);
s7 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
s6 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s5 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s4 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s3 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s2 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s1 = 0;
s0 = 0;
break;
case 3:
s8 = amd_bytealign ( 0, src_r1[0], offset_minus_4);
s7 = amd_bytealign (src_r1[0], src_r0[3], offset_minus_4);
s6 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
s5 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
s4 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s3 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r1[0], offset_minus_4);
s7 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s6 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s5 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s4 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s3 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s2 = 0;
s1 = 0;
s0 = 0;
break;
case 4:
s8 = amd_bytealign ( 0, src_r0[3], offset_minus_4);
s7 = amd_bytealign (src_r0[3], src_r0[2], offset_minus_4);
s6 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
s5 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s4 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r0[3], offset_minus_4);
s7 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s6 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s5 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s4 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s3 = 0;
s2 = 0;
s1 = 0;
@@ -834,10 +834,10 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 5:
s8 = amd_bytealign ( 0, src_r0[2], offset_minus_4);
s7 = amd_bytealign (src_r0[2], src_r0[1], offset_minus_4);
s6 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s5 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r0[2], offset_minus_4);
s7 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s6 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s5 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s4 = 0;
s3 = 0;
s2 = 0;
@@ -846,9 +846,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 6:
s8 = amd_bytealign ( 0, src_r0[1], offset_minus_4);
s7 = amd_bytealign (src_r0[1], src_r0[0], offset_minus_4);
s6 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r0[1], offset_minus_4);
s7 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s6 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s5 = 0;
s4 = 0;
s3 = 0;
@@ -858,8 +858,8 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 7:
s8 = amd_bytealign ( 0, src_r0[0], offset_minus_4);
s7 = amd_bytealign (src_r0[0], 0, offset_minus_4);
s8 = amd_bytealign_S ( 0, src_r0[0], offset_minus_4);
s7 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s6 = 0;
s5 = 0;
s4 = 0;
@@ -911,44 +911,44 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
switch (offset / 4)
{
case 0:
s7 = __byte_perm (src_r1[2], src_r1[3], selector);
s6 = __byte_perm (src_r1[1], src_r1[2], selector);
s5 = __byte_perm (src_r1[0], src_r1[1], selector);
s4 = __byte_perm (src_r0[3], src_r1[0], selector);
s3 = __byte_perm (src_r0[2], src_r0[3], selector);
s2 = __byte_perm (src_r0[1], src_r0[2], selector);
s1 = __byte_perm (src_r0[0], src_r0[1], selector);
s0 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r1[2], src_r1[3], selector);
s6 = __byte_perm_S (src_r1[1], src_r1[2], selector);
s5 = __byte_perm_S (src_r1[0], src_r1[1], selector);
s4 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s3 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s2 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s1 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s0 = __byte_perm_S ( 0, src_r0[0], selector);
break;
case 1:
s7 = __byte_perm (src_r1[1], src_r1[2], selector);
s6 = __byte_perm (src_r1[0], src_r1[1], selector);
s5 = __byte_perm (src_r0[3], src_r1[0], selector);
s4 = __byte_perm (src_r0[2], src_r0[3], selector);
s3 = __byte_perm (src_r0[1], src_r0[2], selector);
s2 = __byte_perm (src_r0[0], src_r0[1], selector);
s1 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r1[1], src_r1[2], selector);
s6 = __byte_perm_S (src_r1[0], src_r1[1], selector);
s5 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s4 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s3 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s2 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s1 = __byte_perm_S ( 0, src_r0[0], selector);
s0 = 0;
break;
case 2:
s7 = __byte_perm (src_r1[0], src_r1[1], selector);
s6 = __byte_perm (src_r0[3], src_r1[0], selector);
s5 = __byte_perm (src_r0[2], src_r0[3], selector);
s4 = __byte_perm (src_r0[1], src_r0[2], selector);
s3 = __byte_perm (src_r0[0], src_r0[1], selector);
s2 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r1[0], src_r1[1], selector);
s6 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s5 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s4 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s3 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s2 = __byte_perm_S ( 0, src_r0[0], selector);
s1 = 0;
s0 = 0;
break;
case 3:
s7 = __byte_perm (src_r0[3], src_r1[0], selector);
s6 = __byte_perm (src_r0[2], src_r0[3], selector);
s5 = __byte_perm (src_r0[1], src_r0[2], selector);
s4 = __byte_perm (src_r0[0], src_r0[1], selector);
s3 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s6 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s5 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s4 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s3 = __byte_perm_S ( 0, src_r0[0], selector);
s2 = 0;
s1 = 0;
s0 = 0;
@@ -956,10 +956,10 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 4:
s7 = __byte_perm (src_r0[2], src_r0[3], selector);
s6 = __byte_perm (src_r0[1], src_r0[2], selector);
s5 = __byte_perm (src_r0[0], src_r0[1], selector);
s4 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s6 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s5 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s4 = __byte_perm_S ( 0, src_r0[0], selector);
s3 = 0;
s2 = 0;
s1 = 0;
@@ -967,9 +967,9 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 5:
s7 = __byte_perm (src_r0[1], src_r0[2], selector);
s6 = __byte_perm (src_r0[0], src_r0[1], selector);
s5 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s6 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s5 = __byte_perm_S ( 0, src_r0[0], selector);
s4 = 0;
s3 = 0;
s2 = 0;
@@ -978,8 +978,8 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 6:
s7 = __byte_perm (src_r0[0], src_r0[1], selector);
s6 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s6 = __byte_perm_S ( 0, src_r0[0], selector);
s5 = 0;
s4 = 0;
s3 = 0;
@@ -989,7 +989,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
break;
case 7:
s7 = __byte_perm ( 0, src_r0[0], selector);
s7 = __byte_perm_S ( 0, src_r0[0], selector);
s6 = 0;
s5 = 0;
s4 = 0;

Loading…
Cancel
Save