From a22da36a008c5bc00badd32e31805a92919c5305 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 3 Jul 2017 10:41:09 +0200 Subject: [PATCH] Add different code generators for truncate_block(), add results to inc_common.cl and make use of them in m01800-pure.cl --- OpenCL/inc_common.cl | 1843 ++++++++++++++++- OpenCL/m01800-pure.cl | 35 +- .../GEN_truncate_block_16x4_be.pl | 42 + .../GEN_truncate_block_16x4_le.pl | 42 + .../GEN_truncate_block_4x4_be.pl | 42 + .../GEN_truncate_block_4x4_le.pl | 42 + 6 files changed, 2000 insertions(+), 46 deletions(-) create mode 100644 tools/code_generators/GEN_truncate_block_16x4_be.pl create mode 100644 tools/code_generators/GEN_truncate_block_16x4_le.pl create mode 100644 tools/code_generators/GEN_truncate_block_4x4_be.pl create mode 100644 tools/code_generators/GEN_truncate_block_4x4_le.pl diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index e7a681a13..bdd45b7b3 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -201,24 +201,1835 @@ inline void truncate_block (u32x w[4], const u32 len) } } -inline void truncate_block_64 (u32x w[16], const u32 len) +inline void truncate_block_4x4_le (u32x w0[4], const u32 len) { - switch (len / 16) + switch (len) { - case 0: truncate_block (w + 0, len & 15); - truncate_block (w + 4, 0); - truncate_block (w + 8, 0); - truncate_block (w + 12, 0); - break; - case 1: truncate_block (w + 4, len & 15); - truncate_block (w + 8, 0); - truncate_block (w + 12, 0); - break; - case 2: truncate_block (w + 8, len & 15); - truncate_block (w + 12, 0); - break; - case 3: truncate_block (w + 12, len & 15); - break; + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 1: + w0[0] &= 0x000000ff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 2: + w0[0] &= 0x0000ffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 3: + w0[0] &= 0x00ffffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 5: + w0[1] &= 0x000000ff; + w0[2] = 0; + w0[3] = 0; + + break; + + case 6: + w0[1] &= 0x0000ffff; + w0[2] = 0; + w0[3] = 0; + + break; + + case 7: + w0[1] &= 0x00ffffff; + w0[2] = 0; + w0[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + + break; + + case 9: + w0[2] &= 0x000000ff; + w0[3] = 0; + + break; + + case 10: + w0[2] &= 0x0000ffff; + w0[3] = 0; + + break; + + case 11: + w0[2] &= 0x00ffffff; + w0[3] = 0; + + break; + + case 12: + w0[3] = 0; + + break; + + case 13: + w0[3] &= 0x000000ff; + + break; + + case 14: + w0[3] &= 0x0000ffff; + + break; + + case 15: + w0[3] &= 0x00ffffff; + + break; + } +} + +inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 1: + w0[0] &= 0x000000ff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 2: + w0[0] &= 0x0000ffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 3: + w0[0] &= 0x00ffffff; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 5: + w0[1] &= 0x000000ff; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 6: + w0[1] &= 0x0000ffff; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 7: + w0[1] &= 0x00ffffff; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 9: + w0[2] &= 0x000000ff; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 10: + w0[2] &= 0x0000ffff; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 11: + w0[2] &= 0x00ffffff; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 12: + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 13: + w0[3] &= 0x000000ff; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 14: + w0[3] &= 0x0000ffff; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 15: + w0[3] &= 0x00ffffff; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 16: + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 17: + w1[0] &= 0x000000ff; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 18: + w1[0] &= 0x0000ffff; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 19: + w1[0] &= 0x00ffffff; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 20: + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 21: + w1[1] &= 0x000000ff; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 22: + w1[1] &= 0x0000ffff; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 23: + w1[1] &= 0x00ffffff; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 24: + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 25: + w1[2] &= 0x000000ff; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 26: + w1[2] &= 0x0000ffff; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 27: + w1[2] &= 0x00ffffff; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 28: + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 29: + w1[3] &= 0x000000ff; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 30: + w1[3] &= 0x0000ffff; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 31: + w1[3] &= 0x00ffffff; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 32: + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 33: + w2[0] &= 0x000000ff; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 34: + w2[0] &= 0x0000ffff; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 35: + w2[0] &= 0x00ffffff; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 36: + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 37: + w2[1] &= 0x000000ff; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 38: + w2[1] &= 0x0000ffff; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 39: + w2[1] &= 0x00ffffff; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 40: + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 41: + w2[2] &= 0x000000ff; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 42: + w2[2] &= 0x0000ffff; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 43: + w2[2] &= 0x00ffffff; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 44: + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 45: + w2[3] &= 0x000000ff; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 46: + w2[3] &= 0x0000ffff; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 47: + w2[3] &= 0x00ffffff; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 48: + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 49: + w3[0] &= 0x000000ff; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 50: + w3[0] &= 0x0000ffff; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 51: + w3[0] &= 0x00ffffff; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 52: + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 53: + w3[1] &= 0x000000ff; + w3[2] = 0; + w3[3] = 0; + + break; + + case 54: + w3[1] &= 0x0000ffff; + w3[2] = 0; + w3[3] = 0; + + break; + + case 55: + w3[1] &= 0x00ffffff; + w3[2] = 0; + w3[3] = 0; + + break; + + case 56: + w3[2] = 0; + w3[3] = 0; + + break; + + case 57: + w3[2] &= 0x000000ff; + w3[3] = 0; + + break; + + case 58: + w3[2] &= 0x0000ffff; + w3[3] = 0; + + break; + + case 59: + w3[2] &= 0x00ffffff; + w3[3] = 0; + + break; + + case 60: + w3[3] = 0; + + break; + + case 61: + w3[3] &= 0x000000ff; + + break; + + case 62: + w3[3] &= 0x0000ffff; + + break; + + case 63: + w3[3] &= 0x00ffffff; + + break; + } +} + +inline void truncate_block_4x4_be (u32x w0[4], const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 1: + w0[0] &= 0xff000000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 2: + w0[0] &= 0xffff0000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 3: + w0[0] &= 0xffffff00; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + + break; + + case 5: + w0[1] &= 0xff000000; + w0[2] = 0; + w0[3] = 0; + + break; + + case 6: + w0[1] &= 0xffff0000; + w0[2] = 0; + w0[3] = 0; + + break; + + case 7: + w0[1] &= 0xffffff00; + w0[2] = 0; + w0[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + + break; + + case 9: + w0[2] &= 0xff000000; + w0[3] = 0; + + break; + + case 10: + w0[2] &= 0xffff0000; + w0[3] = 0; + + break; + + case 11: + w0[2] &= 0xffffff00; + w0[3] = 0; + + break; + + case 12: + w0[3] = 0; + + break; + + case 13: + w0[3] &= 0xff000000; + + break; + + case 14: + w0[3] &= 0xffff0000; + + break; + + case 15: + w0[3] &= 0xffffff00; + + break; + } +} + +inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 1: + w0[0] &= 0xff000000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 2: + w0[0] &= 0xffff0000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 3: + w0[0] &= 0xffffff00; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 5: + w0[1] &= 0xff000000; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 6: + w0[1] &= 0xffff0000; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 7: + w0[1] &= 0xffffff00; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 9: + w0[2] &= 0xff000000; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 10: + w0[2] &= 0xffff0000; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 11: + w0[2] &= 0xffffff00; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 12: + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 13: + w0[3] &= 0xff000000; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 14: + w0[3] &= 0xffff0000; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 15: + w0[3] &= 0xffffff00; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 16: + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 17: + w1[0] &= 0xff000000; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 18: + w1[0] &= 0xffff0000; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 19: + w1[0] &= 0xffffff00; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 20: + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 21: + w1[1] &= 0xff000000; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 22: + w1[1] &= 0xffff0000; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 23: + w1[1] &= 0xffffff00; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 24: + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 25: + w1[2] &= 0xff000000; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 26: + w1[2] &= 0xffff0000; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 27: + w1[2] &= 0xffffff00; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 28: + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 29: + w1[3] &= 0xff000000; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 30: + w1[3] &= 0xffff0000; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 31: + w1[3] &= 0xffffff00; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 32: + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 33: + w2[0] &= 0xff000000; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 34: + w2[0] &= 0xffff0000; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 35: + w2[0] &= 0xffffff00; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 36: + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 37: + w2[1] &= 0xff000000; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 38: + w2[1] &= 0xffff0000; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 39: + w2[1] &= 0xffffff00; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 40: + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 41: + w2[2] &= 0xff000000; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 42: + w2[2] &= 0xffff0000; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 43: + w2[2] &= 0xffffff00; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 44: + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 45: + w2[3] &= 0xff000000; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 46: + w2[3] &= 0xffff0000; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 47: + w2[3] &= 0xffffff00; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 48: + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 49: + w3[0] &= 0xff000000; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 50: + w3[0] &= 0xffff0000; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 51: + w3[0] &= 0xffffff00; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 52: + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + break; + + case 53: + w3[1] &= 0xff000000; + w3[2] = 0; + w3[3] = 0; + + break; + + case 54: + w3[1] &= 0xffff0000; + w3[2] = 0; + w3[3] = 0; + + break; + + case 55: + w3[1] &= 0xffffff00; + w3[2] = 0; + w3[3] = 0; + + break; + + case 56: + w3[2] = 0; + w3[3] = 0; + + break; + + case 57: + w3[2] &= 0xff000000; + w3[3] = 0; + + break; + + case 58: + w3[2] &= 0xffff0000; + w3[3] = 0; + + break; + + case 59: + w3[2] &= 0xffffff00; + w3[3] = 0; + + break; + + case 60: + w3[3] = 0; + + break; + + case 61: + w3[3] &= 0xff000000; + + break; + + case 62: + w3[3] &= 0xffff0000; + + break; + + case 63: + w3[3] &= 0xffffff00; + + break; } } diff --git a/OpenCL/m01800-pure.cl b/OpenCL/m01800-pure.cl index 4578dccaa..46270e3dc 100644 --- a/OpenCL/m01800-pure.cl +++ b/OpenCL/m01800-pure.cl @@ -118,14 +118,9 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul #ifdef _unroll #pragma unroll #endif - for (int i = 0; i < 16; i++) t_final[i] = swap32 (final[i]); + for (int i = 0; i < 16; i++) t_final[i] = final[i]; - truncate_block_64 (t_final, pl); - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0; i < 16; i++) t_final[i] = swap32 (t_final[i]); + truncate_block_16x4_be (t_final + 0, t_final + 4, t_final + 8, t_final + 12, pl); sha512_update (&ctx, t_final, pl); @@ -212,17 +207,7 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul p_final[idx + 15] = final[15]; } - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]); - - truncate_block_64 (final, pl); - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]); + truncate_block_16x4_be (final + 0, final + 4, final + 8, final + 12, pl); p_final[idx + 0] = final[ 0]; p_final[idx + 1] = final[ 1]; @@ -250,7 +235,7 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul sha512_init (&ctx); - for (u32 j = 0; j < 16 + ((tmps[gid].alt_result[ 0] >> 24) & 0xff); j++) + for (u32 j = 0; j < 16 + (tmps[gid].alt_result[0] >> 24); j++) { sha512_update (&ctx, s, salt_len); } @@ -296,17 +281,7 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul s_final[idx + 15] = final[15]; } - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]); - - truncate_block_64 (final, pl); - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0; i < 16; i++) final[i] = swap32 (final[i]); + truncate_block_16x4_be (final + 0, final + 4, final + 8, final + 12, pl); s_final[idx + 0] = final[ 0]; s_final[idx + 1] = final[ 1]; diff --git a/tools/code_generators/GEN_truncate_block_16x4_be.pl b/tools/code_generators/GEN_truncate_block_16x4_be.pl new file mode 100644 index 000000000..41427e754 --- /dev/null +++ b/tools/code_generators/GEN_truncate_block_16x4_be.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +for (my $i = 0; $i < 64; $i++) +{ + printf (" case %2d:\n", $i); + + my $id4 = int ($i / 4); + my $im4 = int ($i % 4); + + if ($im4 == 0) + { + printf (" w%d[%d] = 0;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 1) + { + printf (" w%d[%d] &= 0xff000000;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 2) + { + printf (" w%d[%d] &= 0xffff0000;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 3) + { + printf (" w%d[%d] &= 0xffffff00;\n", $id4 / 4, $id4 % 4); + } + + for (my $j = $id4 + 1; $j < 16; $j++) + { + my $jd4 = int ($j / 4); + my $jm4 = int ($j % 4); + + printf (" w%d[%d] = 0;\n", $jd4, $jm4); + } + + printf ("\n"); + + printf (" break;\n"); + printf ("\n"); +} diff --git a/tools/code_generators/GEN_truncate_block_16x4_le.pl b/tools/code_generators/GEN_truncate_block_16x4_le.pl new file mode 100644 index 000000000..ce17f8bcc --- /dev/null +++ b/tools/code_generators/GEN_truncate_block_16x4_le.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +for (my $i = 0; $i < 64; $i++) +{ + printf (" case %2d:\n", $i); + + my $id4 = int ($i / 4); + my $im4 = int ($i % 4); + + if ($im4 == 0) + { + printf (" w%d[%d] = 0;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 1) + { + printf (" w%d[%d] &= 0x000000ff;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 2) + { + printf (" w%d[%d] &= 0x0000ffff;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 3) + { + printf (" w%d[%d] &= 0x00ffffff;\n", $id4 / 4, $id4 % 4); + } + + for (my $j = $id4 + 1; $j < 16; $j++) + { + my $jd4 = int ($j / 4); + my $jm4 = int ($j % 4); + + printf (" w%d[%d] = 0;\n", $jd4, $jm4); + } + + printf ("\n"); + + printf (" break;\n"); + printf ("\n"); +} diff --git a/tools/code_generators/GEN_truncate_block_4x4_be.pl b/tools/code_generators/GEN_truncate_block_4x4_be.pl new file mode 100644 index 000000000..e5b017841 --- /dev/null +++ b/tools/code_generators/GEN_truncate_block_4x4_be.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +for (my $i = 0; $i < 16; $i++) +{ + printf (" case %2d:\n", $i); + + my $id4 = int ($i / 4); + my $im4 = int ($i % 4); + + if ($im4 == 0) + { + printf (" w%d[%d] = 0;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 1) + { + printf (" w%d[%d] &= 0xff000000;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 2) + { + printf (" w%d[%d] &= 0xffff0000;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 3) + { + printf (" w%d[%d] &= 0xffffff00;\n", $id4 / 4, $id4 % 4); + } + + for (my $j = $id4 + 1; $j < 4; $j++) + { + my $jd4 = int ($j / 4); + my $jm4 = int ($j % 4); + + printf (" w%d[%d] = 0;\n", $jd4, $jm4); + } + + printf ("\n"); + + printf (" break;\n"); + printf ("\n"); +} diff --git a/tools/code_generators/GEN_truncate_block_4x4_le.pl b/tools/code_generators/GEN_truncate_block_4x4_le.pl new file mode 100644 index 000000000..292120770 --- /dev/null +++ b/tools/code_generators/GEN_truncate_block_4x4_le.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +for (my $i = 0; $i < 16; $i++) +{ + printf (" case %2d:\n", $i); + + my $id4 = int ($i / 4); + my $im4 = int ($i % 4); + + if ($im4 == 0) + { + printf (" w%d[%d] = 0;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 1) + { + printf (" w%d[%d] &= 0x000000ff;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 2) + { + printf (" w%d[%d] &= 0x0000ffff;\n", $id4 / 4, $id4 % 4); + } + elsif ($im4 == 3) + { + printf (" w%d[%d] &= 0x00ffffff;\n", $id4 / 4, $id4 % 4); + } + + for (my $j = $id4 + 1; $j < 4; $j++) + { + my $jd4 = int ($j / 4); + my $jm4 = int ($j % 4); + + printf (" w%d[%d] = 0;\n", $jd4, $jm4); + } + + printf ("\n"); + + printf (" break;\n"); + printf ("\n"); +}