From f843c6a4d05d12519ee7400e66dbbdc49722c50e Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 26 May 2016 19:30:09 +0200 Subject: [PATCH] Improve Lotus Notes/Domino 5 performance --- OpenCL/m08600_a0.cl | 117 +++++----------------------------------- OpenCL/m08600_a1.cl | 127 +++++--------------------------------------- OpenCL/m08600_a3.cl | 119 +++++++---------------------------------- 3 files changed, 44 insertions(+), 319 deletions(-) diff --git a/OpenCL/m08600_a0.cl b/OpenCL/m08600_a0.cl index c8da5a799..0d3a6712b 100644 --- a/OpenCL/m08600_a0.cl +++ b/OpenCL/m08600_a0.cl @@ -122,16 +122,16 @@ void pad (u32 w[4], const u32 len) const u32 mask1 = val << 24; const u32 mask2 = val << 16 - | val << 24; + | val << 24; const u32 mask3 = val << 8 - | val << 16 - | val << 24; + | val << 16 + | val << 24; const u32 mask4 = val << 0 - | val << 8 - | val << 16 - | val << 24; + | val << 8 + | val << 16 + | val << 24; switch (len) { @@ -226,7 +226,7 @@ void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s lotus_transform_password (block, checksum, s_lotus_magic_table); } -void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __local u32 *s_lotus_magic_table) +void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __local u32 *s_lotus_magic_table) { u32x checksum[4]; @@ -235,32 +235,7 @@ void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __l checksum[2] = 0; checksum[3] = 0; - u32x block[4]; - - block[0] = 0; - block[1] = 0; - block[2] = 0; - block[3] = 0; - - u32 curpos; - u32 idx; - - for (curpos = 0, idx = 0; curpos + 16 < size; curpos += 16, idx += 4) - { - block[0] = saved_key[idx + 0]; - block[1] = saved_key[idx + 1]; - block[2] = saved_key[idx + 2]; - block[3] = saved_key[idx + 3]; - - mdtransform (state, checksum, block, s_lotus_magic_table); - } - - block[0] = saved_key[idx + 0]; - block[1] = saved_key[idx + 1]; - block[2] = saved_key[idx + 2]; - block[3] = saved_key[idx + 3]; - - mdtransform (state, checksum, block, s_lotus_magic_table); + mdtransform (state, checksum, saved_key, s_lotus_magic_table); mdtransform_norecalc (state, checksum, s_lotus_magic_table); } @@ -325,25 +300,6 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * domino */ - u32x w[16]; - - w[ 0] = w0[0]; - w[ 1] = w0[1]; - w[ 2] = w0[2]; - w[ 3] = w0[3]; - w[ 4] = w1[0]; - w[ 5] = w1[1]; - w[ 6] = w1[2]; - w[ 7] = w1[3]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - u32x state[4]; state[0] = 0; @@ -355,24 +311,9 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * padding */ - if (out_len < 16) - { - pad (&w[ 0], out_len & 0xf); - } - else if (out_len < 32) - { - pad (&w[ 4], out_len & 0xf); - } - else if (out_len < 48) - { - pad (&w[ 8], out_len & 0xf); - } - else if (out_len < 64) - { - pad (&w[12], out_len & 0xf); - } + pad (w0, out_len); - domino_big_md (w, out_len, state, s_lotus_magic_table); + domino_big_md (w0, out_len, state, s_lotus_magic_table); COMPARE_M_SIMD (state[0], state[1], state[2], state[3]); } @@ -458,25 +399,6 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * domino */ - u32x w[16]; - - w[ 0] = w0[0]; - w[ 1] = w0[1]; - w[ 2] = w0[2]; - w[ 3] = w0[3]; - w[ 4] = w1[0]; - w[ 5] = w1[1]; - w[ 6] = w1[2]; - w[ 7] = w1[3]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - u32x state[4]; state[0] = 0; @@ -488,24 +410,9 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * padding */ - if (out_len < 16) - { - pad (&w[ 0], out_len & 0xf); - } - else if (out_len < 32) - { - pad (&w[ 4], out_len & 0xf); - } - else if (out_len < 48) - { - pad (&w[ 8], out_len & 0xf); - } - else if (out_len < 64) - { - pad (&w[12], out_len & 0xf); - } + pad (w0, out_len); - domino_big_md (w, out_len, state, s_lotus_magic_table); + domino_big_md (w0, out_len, state, s_lotus_magic_table); COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } diff --git a/OpenCL/m08600_a1.cl b/OpenCL/m08600_a1.cl index bb08ebe61..91546bba3 100644 --- a/OpenCL/m08600_a1.cl +++ b/OpenCL/m08600_a1.cl @@ -119,16 +119,16 @@ void pad (u32 w[4], const u32 len) const u32 mask1 = val << 24; const u32 mask2 = val << 16 - | val << 24; + | val << 24; const u32 mask3 = val << 8 - | val << 16 - | val << 24; + | val << 16 + | val << 24; const u32 mask4 = val << 0 - | val << 8 - | val << 16 - | val << 24; + | val << 8 + | val << 16 + | val << 24; switch (len) { @@ -223,7 +223,7 @@ void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s lotus_transform_password (block, checksum, s_lotus_magic_table); } -void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __local u32 *s_lotus_magic_table) +void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __local u32 *s_lotus_magic_table) { u32x checksum[4]; @@ -232,32 +232,7 @@ void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __l checksum[2] = 0; checksum[3] = 0; - u32x block[4]; - - block[0] = 0; - block[1] = 0; - block[2] = 0; - block[3] = 0; - - u32 curpos; - u32 idx; - - for (curpos = 0, idx = 0; curpos + 16 < size; curpos += 16, idx += 4) - { - block[0] = saved_key[idx + 0]; - block[1] = saved_key[idx + 1]; - block[2] = saved_key[idx + 2]; - block[3] = saved_key[idx + 3]; - - mdtransform (state, checksum, block, s_lotus_magic_table); - } - - block[0] = saved_key[idx + 0]; - block[1] = saved_key[idx + 1]; - block[2] = saved_key[idx + 2]; - block[3] = saved_key[idx + 3]; - - mdtransform (state, checksum, block, s_lotus_magic_table); + mdtransform (state, checksum, saved_key, s_lotus_magic_table); mdtransform_norecalc (state, checksum, s_lotus_magic_table); } @@ -357,40 +332,16 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, } u32x w0[4]; - u32x w1[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; /** * domino */ - u32x w[16]; - - w[ 0] = w0[0]; - w[ 1] = w0[1]; - w[ 2] = w0[2]; - w[ 3] = w0[3]; - w[ 4] = w1[0]; - w[ 5] = w1[1]; - w[ 6] = w1[2]; - w[ 7] = w1[3]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - u32x state[4]; state[0] = 0; @@ -402,24 +353,9 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * padding */ - if (pw_len < 16) - { - pad (&w[ 0], pw_len & 0xf); - } - else if (pw_len < 32) - { - pad (&w[ 4], pw_len & 0xf); - } - else if (pw_len < 48) - { - pad (&w[ 8], pw_len & 0xf); - } - else if (pw_len < 64) - { - pad (&w[12], pw_len & 0xf); - } + pad (w0, pw_len); - domino_big_md (w, pw_len, state, s_lotus_magic_table); + domino_big_md (w0, pw_len, state, s_lotus_magic_table); COMPARE_M_SIMD (state[0], state[1], state[2], state[3]); } @@ -540,40 +476,16 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, } u32x w0[4]; - u32x w1[4]; w0[0] = wordl0[0] | wordr0[0]; w0[1] = wordl0[1] | wordr0[1]; w0[2] = wordl0[2] | wordr0[2]; w0[3] = wordl0[3] | wordr0[3]; - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; /** * domino */ - u32x w[16]; - - w[ 0] = w0[0]; - w[ 1] = w0[1]; - w[ 2] = w0[2]; - w[ 3] = w0[3]; - w[ 4] = w1[0]; - w[ 5] = w1[1]; - w[ 6] = w1[2]; - w[ 7] = w1[3]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - u32x state[4]; state[0] = 0; @@ -585,24 +497,9 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * padding */ - if (pw_len < 16) - { - pad (&w[ 0], pw_len & 0xf); - } - else if (pw_len < 32) - { - pad (&w[ 4], pw_len & 0xf); - } - else if (pw_len < 48) - { - pad (&w[ 8], pw_len & 0xf); - } - else if (pw_len < 64) - { - pad (&w[12], pw_len & 0xf); - } + pad (w0, pw_len); - domino_big_md (w, pw_len, state, s_lotus_magic_table); + domino_big_md (w0, pw_len, state, s_lotus_magic_table); COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } diff --git a/OpenCL/m08600_a3.cl b/OpenCL/m08600_a3.cl index 68643492a..c77019c91 100644 --- a/OpenCL/m08600_a3.cl +++ b/OpenCL/m08600_a3.cl @@ -119,16 +119,16 @@ void pad (u32 w[4], const u32 len) const u32 mask1 = val << 24; const u32 mask2 = val << 16 - | val << 24; + | val << 24; const u32 mask3 = val << 8 - | val << 16 - | val << 24; + | val << 16 + | val << 24; const u32 mask4 = val << 0 - | val << 8 - | val << 16 - | val << 24; + | val << 8 + | val << 16 + | val << 24; switch (len) { @@ -223,7 +223,7 @@ void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s lotus_transform_password (block, checksum, s_lotus_magic_table); } -void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __local u32 *s_lotus_magic_table) +void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __local u32 *s_lotus_magic_table) { u32x checksum[4]; @@ -232,32 +232,7 @@ void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __l checksum[2] = 0; checksum[3] = 0; - u32x block[4]; - - block[0] = 0; - block[1] = 0; - block[2] = 0; - block[3] = 0; - - u32 curpos; - u32 idx; - - for (curpos = 0, idx = 0; curpos + 16 < size; curpos += 16, idx += 4) - { - block[0] = saved_key[idx + 0]; - block[1] = saved_key[idx + 1]; - block[2] = saved_key[idx + 2]; - block[3] = saved_key[idx + 3]; - - mdtransform (state, checksum, block, s_lotus_magic_table); - } - - block[0] = saved_key[idx + 0]; - block[1] = saved_key[idx + 1]; - block[2] = saved_key[idx + 2]; - block[3] = saved_key[idx + 3]; - - mdtransform (state, checksum, block, s_lotus_magic_table); + mdtransform (state, checksum, saved_key, s_lotus_magic_table); mdtransform_norecalc (state, checksum, s_lotus_magic_table); } @@ -275,22 +250,7 @@ void m08600m (__local u32 *s_lotus_magic_table, u32 w[16], const u32 pw_len, __g * base */ - if (pw_len < 16) - { - pad (&w[ 0], pw_len & 0xf); - } - else if (pw_len < 32) - { - pad (&w[ 4], pw_len & 0xf); - } - else if (pw_len < 48) - { - pad (&w[ 8], pw_len & 0xf); - } - else if (pw_len < 64) - { - pad (&w[12], pw_len & 0xf); - } + pad (&w[ 0], pw_len); /** * loop @@ -304,24 +264,12 @@ void m08600m (__local u32 *s_lotus_magic_table, u32 w[16], const u32 pw_len, __g const u32x w0lr = w0l | w0r; - u32x w_t[16]; + u32x w_t[4]; - w_t[ 0] = w0lr; - w_t[ 1] = w[ 1]; - w_t[ 2] = w[ 2]; - w_t[ 3] = w[ 3]; - w_t[ 4] = w[ 4]; - w_t[ 5] = w[ 5]; - w_t[ 6] = w[ 6]; - w_t[ 7] = w[ 7]; - w_t[ 8] = w[ 8]; - w_t[ 9] = w[ 9]; - w_t[10] = w[10]; - w_t[11] = w[11]; - w_t[12] = w[12]; - w_t[13] = w[13]; - w_t[14] = w[14]; - w_t[15] = w[15]; + w_t[0] = w0lr; + w_t[1] = w[ 1]; + w_t[2] = w[ 2]; + w_t[3] = w[ 3]; u32x state[4]; @@ -349,22 +297,7 @@ void m08600s (__local u32 *s_lotus_magic_table, u32 w[16], const u32 pw_len, __g * base */ - if (pw_len < 16) - { - pad (&w[ 0], pw_len & 0xf); - } - else if (pw_len < 32) - { - pad (&w[ 4], pw_len & 0xf); - } - else if (pw_len < 48) - { - pad (&w[ 8], pw_len & 0xf); - } - else if (pw_len < 64) - { - pad (&w[12], pw_len & 0xf); - } + pad (&w[0], pw_len); /** * digest @@ -390,24 +323,12 @@ void m08600s (__local u32 *s_lotus_magic_table, u32 w[16], const u32 pw_len, __g const u32x w0lr = w0l | w0r; - u32x w_t[16]; + u32x w_t[4]; - w_t[ 0] = w0lr; - w_t[ 1] = w[ 1]; - w_t[ 2] = w[ 2]; - w_t[ 3] = w[ 3]; - w_t[ 4] = w[ 4]; - w_t[ 5] = w[ 5]; - w_t[ 6] = w[ 6]; - w_t[ 7] = w[ 7]; - w_t[ 8] = w[ 8]; - w_t[ 9] = w[ 9]; - w_t[10] = w[10]; - w_t[11] = w[11]; - w_t[12] = w[12]; - w_t[13] = w[13]; - w_t[14] = w[14]; - w_t[15] = w[15]; + w_t[0] = w0lr; + w_t[1] = w[ 1]; + w_t[2] = w[ 2]; + w_t[3] = w[ 3]; u32x state[4];