Work around AMD Catalyst optimizer bug in rule-engine

pull/142/head
Jens Steube 8 years ago
parent 5c07a412ec
commit 62f66a8784

@ -1651,268 +1651,213 @@ static void append_block8 (const u32 offset, u32 dst0[4], u32 dst1[4], const u32
#if defined IS_AMD || defined IS_GENERIC #if defined IS_AMD || defined IS_GENERIC
switch (offset) switch (offset)
{ {
case 0: case 31:
dst0[0] = src_r0[0]; dst1[3] = src_l1[3] | src_r0[0] << 24;
dst0[1] = src_r0[1];
dst0[2] = src_r0[2];
dst0[3] = src_r0[3];
dst1[0] = src_r1[0];
dst1[1] = src_r1[1];
dst1[2] = src_r1[2];
dst1[3] = src_r1[3];
break;
case 1:
dst0[0] = src_l0[0]
| src_r0[0] << 8;
dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 3);
dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 3);
dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 3);
dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 3);
break; break;
case 30:
case 2: dst1[3] = src_l1[3] | src_r0[0] << 16;
dst0[0] = src_l0[0]
| src_r0[0] << 16;
dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 2);
dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 2);
dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 2);
dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 2);
break; break;
case 29:
case 3: dst1[3] = src_l1[3] | src_r0[0] << 8;
dst0[0] = src_l0[0]
| src_r0[0] << 24;
dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 1);
dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 1);
dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 1);
dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 1);
break; break;
case 28:
case 4: dst1[3] = src_r0[0];
dst0[1] = src_r0[0];
dst0[2] = src_r0[1];
dst0[3] = src_r0[2];
dst1[0] = src_r0[3];
dst1[1] = src_r1[0];
dst1[2] = src_r1[1];
dst1[3] = src_r1[2];
break; break;
case 27:
case 5: dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst0[1] = src_l0[1] dst1[2] = src_l1[2] | src_r0[0] << 24;
| src_r0[0] << 8;
dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 3);
dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 3);
dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 3);
break; break;
case 26:
case 6: dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst0[1] = src_l0[1] dst1[2] = src_l1[2] | src_r0[0] << 16;
| src_r0[0] << 16;
dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 2);
dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 2);
dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 2);
break; break;
case 25:
case 7: dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst0[1] = src_l0[1] dst1[2] = src_l1[2] | src_r0[0] << 8;
| src_r0[0] << 24;
dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 1);
dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 1);
dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 1);
break; break;
case 24:
case 8: dst1[3] = src_r0[1];
dst0[2] = src_r0[0]; dst1[2] = src_r0[0];
dst0[3] = src_r0[1];
dst1[0] = src_r0[2];
dst1[1] = src_r0[3];
dst1[2] = src_r1[0];
dst1[3] = src_r1[1];
break; break;
case 23:
case 9: dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst0[2] = src_l0[2] dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
| src_r0[0] << 8; dst1[1] = src_l1[1] | src_r0[0] << 24;
dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 3);
dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 3);
break; break;
case 22:
case 10: dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst0[2] = src_l0[2] dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
| src_r0[0] << 16; dst1[1] = src_l1[1] | src_r0[0] << 16;
dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 2);
dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 2);
break; break;
case 21:
case 11: dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst0[2] = src_l0[2] dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
| src_r0[0] << 24; dst1[1] = src_l1[1] | src_r0[0] << 8;
dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 1);
dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 1);
break; break;
case 20:
case 12: dst1[3] = src_r0[2];
dst0[3] = src_r0[0]; dst1[2] = src_r0[1];
dst1[0] = src_r0[1]; dst1[1] = src_r0[0];
dst1[1] = src_r0[2];
dst1[2] = src_r0[3];
dst1[3] = src_r1[0];
break; break;
case 19:
case 13: dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst0[3] = src_l0[3] dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
| src_r0[0] << 8; dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 3); dst1[0] = src_l1[0] | src_r0[0] << 24;
dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 3);
break; break;
case 18:
case 14: dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst0[3] = src_l0[3] dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
| src_r0[0] << 16; dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 2); dst1[0] = src_l1[0] | src_r0[0] << 16;
dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 2);
break; break;
case 17:
case 15: dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst0[3] = src_l0[3] dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
| src_r0[0] << 24; dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 1); dst1[0] = src_l1[0] | src_r0[0] << 8;
dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 1);
break; break;
case 16: case 16:
dst1[0] = src_r0[0];
dst1[1] = src_r0[1];
dst1[2] = src_r0[2];
dst1[3] = src_r0[3]; dst1[3] = src_r0[3];
dst1[2] = src_r0[2];
dst1[1] = src_r0[1];
dst1[0] = src_r0[0];
break; break;
case 15:
case 17: dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 1);
dst1[0] = src_l1[0] dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 1);
| src_r0[0] << 8; dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 3); dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 3); dst0[3] = src_l0[3] | src_r0[0] << 24;
dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
break; break;
case 14:
case 18: dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 2);
dst1[0] = src_l1[0] dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 2);
| src_r0[0] << 16; dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 2); dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 2); dst0[3] = src_l0[3] | src_r0[0] << 16;
dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
break; break;
case 13:
case 19: dst1[3] = amd_bytealign (src_r1[0], src_r0[3], 3);
dst1[0] = src_l1[0] dst1[2] = amd_bytealign (src_r0[3], src_r0[2], 3);
| src_r0[0] << 24; dst1[1] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst1[1] = amd_bytealign (src_r0[1], src_r0[0], 1); dst1[0] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst1[2] = amd_bytealign (src_r0[2], src_r0[1], 1); dst0[3] = src_l0[3] | src_r0[0] << 8;
dst1[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
break; break;
case 12:
case 20: dst1[3] = src_r1[0];
dst1[1] = src_r0[0]; dst1[2] = src_r0[3];
dst1[2] = src_r0[1]; dst1[1] = src_r0[2];
dst1[3] = src_r0[2]; dst1[0] = src_r0[1];
dst0[3] = src_r0[0];
break; break;
case 11:
case 21: dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 1);
dst1[1] = src_l1[1] dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 1);
| src_r0[0] << 8; dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 3); dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 3); dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst0[2] = src_l0[2] | src_r0[0] << 24;
break; break;
case 10:
case 22: dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 2);
dst1[1] = src_l1[1] dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 2);
| src_r0[0] << 16; dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 2); dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 2); dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst0[2] = src_l0[2] | src_r0[0] << 16;
break; break;
case 9:
case 23: dst1[3] = amd_bytealign (src_r1[1], src_r1[0], 3);
dst1[1] = src_l1[1] dst1[2] = amd_bytealign (src_r1[0], src_r0[3], 3);
| src_r0[0] << 24; dst1[1] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst1[2] = amd_bytealign (src_r0[1], src_r0[0], 1); dst1[0] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst1[3] = amd_bytealign (src_r0[2], src_r0[1], 1); dst0[3] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst0[2] = src_l0[2] | src_r0[0] << 8;
break; break;
case 8:
case 24: dst1[3] = src_r1[1];
dst1[2] = src_r0[0]; dst1[2] = src_r1[0];
dst1[3] = src_r0[1]; dst1[1] = src_r0[3];
dst1[0] = src_r0[2];
dst0[3] = src_r0[1];
dst0[2] = src_r0[0];
break; break;
case 7:
case 25: dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 1);
dst1[2] = src_l1[2] dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 1);
| src_r0[0] << 8; dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 1);
dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 3); dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst0[1] = src_l0[1] | src_r0[0] << 24;
break; break;
case 6:
case 26: dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 2);
dst1[2] = src_l1[2] dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 2);
| src_r0[0] << 16; dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 2);
dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 2); dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst0[1] = src_l0[1] | src_r0[0] << 16;
break; break;
case 5:
case 27: dst1[3] = amd_bytealign (src_r1[2], src_r1[1], 3);
dst1[2] = src_l1[2] dst1[2] = amd_bytealign (src_r1[1], src_r1[0], 3);
| src_r0[0] << 24; dst1[1] = amd_bytealign (src_r1[0], src_r0[3], 3);
dst1[3] = amd_bytealign (src_r0[1], src_r0[0], 1); dst1[0] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst0[3] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst0[2] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst0[1] = src_l0[1] | src_r0[0] << 8;
break; break;
case 4:
case 28: dst1[3] = src_r1[2];
dst1[3] = src_r0[0]; dst1[2] = src_r1[1];
dst1[1] = src_r1[0];
dst1[0] = src_r0[3];
dst0[3] = src_r0[2];
dst0[2] = src_r0[1];
dst0[1] = src_r0[0];
break; break;
case 3:
case 29: dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 1);
dst1[3] = src_l1[3] dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 1);
| src_r0[0] << 8; dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 1);
dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 1);
dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 1);
dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 1);
dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 1);
dst0[0] = src_l0[0] | src_r0[0] << 24;
break; break;
case 2:
case 30: dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 2);
dst1[3] = src_l1[3] dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 2);
| src_r0[0] << 16; dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 2);
dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 2);
dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 2);
dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 2);
dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 2);
dst0[0] = src_l0[0] | src_r0[0] << 16;
break; break;
case 1:
case 31: dst1[3] = amd_bytealign (src_r1[3], src_r1[2], 3);
dst1[3] = src_l1[3] dst1[2] = amd_bytealign (src_r1[2], src_r1[1], 3);
| src_r0[0] << 24; dst1[1] = amd_bytealign (src_r1[1], src_r1[0], 3);
dst1[0] = amd_bytealign (src_r1[0], src_r0[3], 3);
dst0[3] = amd_bytealign (src_r0[3], src_r0[2], 3);
dst0[2] = amd_bytealign (src_r0[2], src_r0[1], 3);
dst0[1] = amd_bytealign (src_r0[1], src_r0[0], 3);
dst0[0] = src_l0[0] | src_r0[0] << 8;
break;
case 0:
dst1[3] = src_r1[3];
dst1[2] = src_r1[2];
dst1[1] = src_r1[1];
dst1[0] = src_r1[0];
dst0[3] = src_r0[3];
dst0[2] = src_r0[2];
dst0[1] = src_r0[1];
dst0[0] = src_r0[0];
break; break;
} }
#endif #endif
@ -2038,19 +1983,7 @@ static u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32
u32 out_len = in_len; u32 out_len = in_len;
u32 tib40[4]; append_block8 (out_len, buf0, buf1, buf0, buf1, buf0, buf1);
u32 tib41[4];
tib40[0] = buf0[0];
tib40[1] = buf0[1];
tib40[2] = buf0[2];
tib40[3] = buf0[3];
tib41[0] = buf1[0];
tib41[1] = buf1[1];
tib41[2] = buf1[2];
tib41[3] = buf1[3];
append_block8 (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
out_len += in_len; out_len += in_len;

Loading…
Cancel
Save