diff --git a/OpenCL/m00020_a3-optimized.cl b/OpenCL/m00020_a3-optimized.cl index 002119f5e..6a8268d27 100644 --- a/OpenCL/m00020_a3-optimized.cl +++ b/OpenCL/m00020_a3-optimized.cl @@ -52,63 +52,57 @@ void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_le_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = pw_salt_len * 8; - t3[3] = 0; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_le_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_le (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = pw_salt_len * 8; + t3[3] = 0; /** * md5 @@ -243,63 +237,57 @@ void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_le_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = pw_salt_len * 8; - t3[3] = 0; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_le_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_le (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = pw_salt_len * 8; + t3[3] = 0; /** * md5 diff --git a/OpenCL/m00040_a3-optimized.cl b/OpenCL/m00040_a3-optimized.cl index 8017e0ca9..063b851a1 100644 --- a/OpenCL/m00040_a3-optimized.cl +++ b/OpenCL/m00040_a3-optimized.cl @@ -52,63 +52,57 @@ void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_le_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = pw_salt_len * 8; - t3[3] = 0; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_le_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_le (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = pw_salt_len * 8; + t3[3] = 0; /** * md5 @@ -243,63 +237,57 @@ void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_le_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = pw_salt_len * 8; - t3[3] = 0; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_le_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_le (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = pw_salt_len * 8; + t3[3] = 0; /** * md5 diff --git a/OpenCL/m00120_a3-optimized.cl b/OpenCL/m00120_a3-optimized.cl index 2170c91f7..011fdbeba 100644 --- a/OpenCL/m00120_a3-optimized.cl +++ b/OpenCL/m00120_a3-optimized.cl @@ -52,63 +52,57 @@ void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha1 @@ -295,63 +289,57 @@ void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha1 diff --git a/OpenCL/m00140_a3-optimized.cl b/OpenCL/m00140_a3-optimized.cl index a304aa880..79f98c67d 100644 --- a/OpenCL/m00140_a3-optimized.cl +++ b/OpenCL/m00140_a3-optimized.cl @@ -52,63 +52,57 @@ void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha1 @@ -295,63 +289,57 @@ void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha1 diff --git a/OpenCL/m01420_a3-optimized.cl b/OpenCL/m01420_a3-optimized.cl index 21cb7c8cb..70829a8c4 100644 --- a/OpenCL/m01420_a3-optimized.cl +++ b/OpenCL/m01420_a3-optimized.cl @@ -66,63 +66,57 @@ void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha256 @@ -247,6 +241,24 @@ void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digests_buf[digests_offset].digest_buf[DGST_R3] }; + /** + * reverse + */ + + u32 a_rev = digests_buf[digests_offset].digest_buf[0]; + u32 b_rev = digests_buf[digests_offset].digest_buf[1]; + u32 c_rev = digests_buf[digests_offset].digest_buf[2]; + u32 d_rev = digests_buf[digests_offset].digest_buf[3]; + u32 e_rev = digests_buf[digests_offset].digest_buf[4]; + u32 f_rev = digests_buf[digests_offset].digest_buf[5]; + u32 g_rev = digests_buf[digests_offset].digest_buf[6]; + u32 h_rev = digests_buf[digests_offset].digest_buf[7]; + + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + /** * salt */ @@ -278,81 +290,57 @@ void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * reverse - */ - - u32 a_rev = digests_buf[digests_offset].digest_buf[0]; - u32 b_rev = digests_buf[digests_offset].digest_buf[1]; - u32 c_rev = digests_buf[digests_offset].digest_buf[2]; - u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - u32 e_rev = digests_buf[digests_offset].digest_buf[4]; - u32 f_rev = digests_buf[digests_offset].digest_buf[5]; - u32 g_rev = digests_buf[digests_offset].digest_buf[6]; - u32 h_rev = digests_buf[digests_offset].digest_buf[7]; - - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha256 diff --git a/OpenCL/m01440_a3-optimized.cl b/OpenCL/m01440_a3-optimized.cl index f142dfdbc..12518e0c0 100644 --- a/OpenCL/m01440_a3-optimized.cl +++ b/OpenCL/m01440_a3-optimized.cl @@ -66,63 +66,57 @@ void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha256 @@ -247,6 +241,24 @@ void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digests_buf[digests_offset].digest_buf[DGST_R3] }; + /** + * reverse + */ + + u32 a_rev = digests_buf[digests_offset].digest_buf[0]; + u32 b_rev = digests_buf[digests_offset].digest_buf[1]; + u32 c_rev = digests_buf[digests_offset].digest_buf[2]; + u32 d_rev = digests_buf[digests_offset].digest_buf[3]; + u32 e_rev = digests_buf[digests_offset].digest_buf[4]; + u32 f_rev = digests_buf[digests_offset].digest_buf[5]; + u32 g_rev = digests_buf[digests_offset].digest_buf[6]; + u32 h_rev = digests_buf[digests_offset].digest_buf[7]; + + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); + /** * salt */ @@ -278,81 +290,57 @@ void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * reverse - */ - - u32 a_rev = digests_buf[digests_offset].digest_buf[0]; - u32 b_rev = digests_buf[digests_offset].digest_buf[1]; - u32 c_rev = digests_buf[digests_offset].digest_buf[2]; - u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - u32 e_rev = digests_buf[digests_offset].digest_buf[4]; - u32 f_rev = digests_buf[digests_offset].digest_buf[5]; - u32 g_rev = digests_buf[digests_offset].digest_buf[6]; - u32 h_rev = digests_buf[digests_offset].digest_buf[7]; - - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - SHA256_STEP_REV (a_rev, b_rev, c_rev, d_rev, e_rev, f_rev, g_rev, h_rev); - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha256 diff --git a/OpenCL/m01720_a3-optimized.cl b/OpenCL/m01720_a3-optimized.cl index 4e57bfcf2..3ebbea0d8 100644 --- a/OpenCL/m01720_a3-optimized.cl +++ b/OpenCL/m01720_a3-optimized.cl @@ -175,90 +175,62 @@ void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; - u64x digest[8]; digest[0] = SHA512M_A; @@ -270,7 +242,7 @@ void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[6] = SHA512M_G; digest[7] = SHA512M_H; - sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha512_transform (t0, t1, t2, t3, digest); const u32x r0 = l32_from_64 (digest[7]); const u32x r1 = h32_from_64 (digest[7]); @@ -333,90 +305,62 @@ void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; - u64x digest[8]; digest[0] = SHA512M_A; @@ -428,7 +372,7 @@ void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[6] = SHA512M_G; digest[7] = SHA512M_H; - sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha512_transform (t0, t1, t2, t3, digest); const u32x r0 = l32_from_64 (digest[7]); const u32x r1 = h32_from_64 (digest[7]); diff --git a/OpenCL/m01740_a3-optimized.cl b/OpenCL/m01740_a3-optimized.cl index e78a02ae0..f1811cc7c 100644 --- a/OpenCL/m01740_a3-optimized.cl +++ b/OpenCL/m01740_a3-optimized.cl @@ -175,90 +175,62 @@ void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; - u64x digest[8]; digest[0] = SHA512M_A; @@ -270,7 +242,7 @@ void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[6] = SHA512M_G; digest[7] = SHA512M_H; - sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha512_transform (t0, t1, t2, t3, digest); const u32x r0 = l32_from_64 (digest[7]); const u32x r1 = h32_from_64 (digest[7]); @@ -333,90 +305,62 @@ void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 pw_salt_len = pw_len + salt_len; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_be_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = w3[2]; - t3[3] = w3[3]; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_be_4x4 (t0, t1, t2, t3, w0lr, salt_len); + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; + + t0[0] = w0lr; + t0[1] = w0[1]; + t0[2] = w0[2]; + t0[3] = w0[3]; + t1[0] = w1[0]; + t1[1] = w1[1]; + t1[2] = w1[2]; + t1[3] = w1[3]; + t2[0] = w2[0]; + t2[1] = w2[1]; + t2[2] = w2[2]; + t2[3] = w2[3]; + t3[0] = w3[0]; + t3[1] = w3[1]; + t3[2] = w3[2]; + t3[3] = w3[3]; + + switch_buffer_by_offset_be (t0, t1, t2, t3, salt_len); + + t0[0] |= salt_buf0[0]; + t0[1] |= salt_buf0[1]; + t0[2] |= salt_buf0[2]; + t0[3] |= salt_buf0[3]; + t1[0] |= salt_buf1[0]; + t1[1] |= salt_buf1[1]; + t1[2] |= salt_buf1[2]; + t1[3] |= salt_buf1[3]; + t2[0] |= salt_buf2[0]; + t2[1] |= salt_buf2[1]; + t2[2] |= salt_buf2[2]; + t2[3] |= salt_buf2[3]; + t3[0] |= salt_buf3[0]; + t3[1] |= salt_buf3[1]; + t3[2] = 0; + t3[3] = pw_salt_len * 8; /** * sha512 */ - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; - u64x digest[8]; digest[0] = SHA512M_A; @@ -428,7 +372,7 @@ void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[6] = SHA512M_G; digest[7] = SHA512M_H; - sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha512_transform (t0, t1, t2, t3, digest); const u32x r0 = l32_from_64 (digest[7]); const u32x r1 = h32_from_64 (digest[7]); diff --git a/OpenCL/m03100_a3-optimized.cl b/OpenCL/m03100_a3-optimized.cl index 54460d8b7..77a4c7e9c 100644 --- a/OpenCL/m03100_a3-optimized.cl +++ b/OpenCL/m03100_a3-optimized.cl @@ -513,62 +513,6 @@ void m03100m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], const u32 salt_word_len = (salt_len + pw_len) * 2; - /** - * prepend salt - */ - - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; - - w0_t[0] = w[ 0]; - w0_t[1] = w[ 1]; - w0_t[2] = w[ 2]; - w0_t[3] = w[ 3]; - w1_t[0] = w[ 4]; - w1_t[1] = w[ 5]; - w1_t[2] = w[ 6]; - w1_t[3] = w[ 7]; - w2_t[0] = w[ 8]; - w2_t[1] = w[ 9]; - w2_t[2] = w[10]; - w2_t[3] = w[11]; - w3_t[0] = w[12]; - w3_t[1] = w[13]; - w3_t[2] = w[14]; - w3_t[3] = w[15]; - - switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); - - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - - u32x dst[16]; - - dst[ 0] = w0_t[0]; - dst[ 1] = w0_t[1]; - dst[ 2] = w0_t[2]; - dst[ 3] = w0_t[3]; - dst[ 4] = w1_t[0]; - dst[ 5] = w1_t[1]; - dst[ 6] = w1_t[2]; - dst[ 7] = w1_t[3]; - dst[ 8] = w2_t[0]; - dst[ 9] = w2_t[1]; - dst[10] = w2_t[2]; - dst[11] = w2_t[3]; - dst[12] = w3_t[0]; - dst[13] = w3_t[1]; - dst[14] = w3_t[2]; - dst[15] = w3_t[3]; - /** * loop */ @@ -579,9 +523,50 @@ void m03100m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], { const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0 = w0l | w0r; + const u32x w0lr = w0l | w0r; - overwrite_at_le (dst, w0, salt_len); + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = w0lr; + w0[1] = w[ 1]; + w0[2] = w[ 2]; + w0[3] = w[ 3]; + w1[0] = w[ 4]; + w1[1] = w[ 5]; + w1[2] = w[ 6]; + w1[3] = w[ 7]; + w2[0] = w[ 8]; + w2[1] = w[ 9]; + w2[2] = w[10]; + w2[3] = w[11]; + w3[0] = w[12]; + w3[1] = w[13]; + w3[2] = w[14]; + w3[3] = w[15]; + + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); + + u32x dst[16]; + + dst[ 0] = w0[0] | salt_buf0[0]; + dst[ 1] = w0[1] | salt_buf0[1]; + dst[ 2] = w0[2] | salt_buf0[2]; + dst[ 3] = w0[3] | salt_buf0[3]; + dst[ 4] = w1[0] | salt_buf1[0]; + dst[ 5] = w1[1] | salt_buf1[1]; + dst[ 6] = w1[2] | salt_buf1[2]; + dst[ 7] = w1[3] | salt_buf1[3]; + dst[ 8] = w2[0]; + dst[ 9] = w2[1]; + dst[10] = w2[2]; + dst[11] = w2[3]; + dst[12] = w3[0]; + dst[13] = w3[1]; + dst[14] = w3[2]; + dst[15] = w3[3]; /** * precompute key1 since key is static: 0x0123456789abcdef @@ -709,62 +694,6 @@ void m03100s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], const u32 salt_word_len = (salt_len + pw_len) * 2; - /** - * prepend salt - */ - - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; - - w0_t[0] = w[ 0]; - w0_t[1] = w[ 1]; - w0_t[2] = w[ 2]; - w0_t[3] = w[ 3]; - w1_t[0] = w[ 4]; - w1_t[1] = w[ 5]; - w1_t[2] = w[ 6]; - w1_t[3] = w[ 7]; - w2_t[0] = w[ 8]; - w2_t[1] = w[ 9]; - w2_t[2] = w[10]; - w2_t[3] = w[11]; - w3_t[0] = w[12]; - w3_t[1] = w[13]; - w3_t[2] = w[14]; - w3_t[3] = w[15]; - - switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); - - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - - u32x dst[16]; - - dst[ 0] = w0_t[0]; - dst[ 1] = w0_t[1]; - dst[ 2] = w0_t[2]; - dst[ 3] = w0_t[3]; - dst[ 4] = w1_t[0]; - dst[ 5] = w1_t[1]; - dst[ 6] = w1_t[2]; - dst[ 7] = w1_t[3]; - dst[ 8] = w2_t[0]; - dst[ 9] = w2_t[1]; - dst[10] = w2_t[2]; - dst[11] = w2_t[3]; - dst[12] = w3_t[0]; - dst[13] = w3_t[1]; - dst[14] = w3_t[2]; - dst[15] = w3_t[3]; - /** * digest */ @@ -787,9 +716,50 @@ void m03100s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], { const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0 = w0l | w0r; + const u32x w0lr = w0l | w0r; - overwrite_at_le (dst, w0, salt_len); + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = w0lr; + w0[1] = w[ 1]; + w0[2] = w[ 2]; + w0[3] = w[ 3]; + w1[0] = w[ 4]; + w1[1] = w[ 5]; + w1[2] = w[ 6]; + w1[3] = w[ 7]; + w2[0] = w[ 8]; + w2[1] = w[ 9]; + w2[2] = w[10]; + w2[3] = w[11]; + w3[0] = w[12]; + w3[1] = w[13]; + w3[2] = w[14]; + w3[3] = w[15]; + + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); + + u32x dst[16]; + + dst[ 0] = w0[0] | salt_buf0[0]; + dst[ 1] = w0[1] | salt_buf0[1]; + dst[ 2] = w0[2] | salt_buf0[2]; + dst[ 3] = w0[3] | salt_buf0[3]; + dst[ 4] = w1[0] | salt_buf1[0]; + dst[ 5] = w1[1] | salt_buf1[1]; + dst[ 6] = w1[2] | salt_buf1[2]; + dst[ 7] = w1[3] | salt_buf1[3]; + dst[ 8] = w2[0]; + dst[ 9] = w2[1]; + dst[10] = w2[2]; + dst[11] = w2[3]; + dst[12] = w3[0]; + dst[13] = w3[1]; + dst[14] = w3[2]; + dst[15] = w3[3]; /** * precompute key1 since key is static: 0x0123456789abcdef diff --git a/OpenCL/m04010_a3-optimized.cl b/OpenCL/m04010_a3-optimized.cl index 54c2faca8..ff7abfe05 100644 --- a/OpenCL/m04010_a3-optimized.cl +++ b/OpenCL/m04010_a3-optimized.cl @@ -65,85 +65,57 @@ void m04010m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 final_len = salt_len + 32; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_le_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = pw_salt_len * 8; - t3[3] = 0; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_le_4x4 (t0, t1, t2, t3, w0lr, salt_len); - u32x w0_t[4]; u32x w1_t[4]; u32x w2_t[4]; u32x w3_t[4]; - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = t3[2]; - w3_t[3] = t3[3]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] = pw_salt_len * 8; + w3_t[3] = 0; /** * md5 @@ -412,85 +384,57 @@ void m04010s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 final_len = salt_len + 32; /** - * prepend salt + * loop */ const u32 w0l = w0[0]; - switch_buffer_by_offset_le_S (w0, w1, w2, w3, salt_len); - - w0[0] |= salt_buf0[0]; - w0[1] |= salt_buf0[1]; - w0[2] |= salt_buf0[2]; - w0[3] |= salt_buf0[3]; - w1[0] |= salt_buf1[0]; - w1[1] |= salt_buf1[1]; - w1[2] |= salt_buf1[2]; - w1[3] |= salt_buf1[3]; - w2[0] |= salt_buf2[0]; - w2[1] |= salt_buf2[1]; - w2[2] |= salt_buf2[2]; - w2[3] |= salt_buf2[3]; - w3[0] |= salt_buf3[0]; - w3[1] |= salt_buf3[1]; - w3[2] |= salt_buf3[2]; - w3[3] |= salt_buf3[3]; - - u32x t0[4]; - u32x t1[4]; - u32x t2[4]; - u32x t3[4]; - - t0[0] = w0[0]; - t0[1] = w0[1]; - t0[2] = w0[2]; - t0[3] = w0[3]; - t1[0] = w1[0]; - t1[1] = w1[1]; - t1[2] = w1[2]; - t1[3] = w1[3]; - t2[0] = w2[0]; - t2[1] = w2[1]; - t2[2] = w2[2]; - t2[3] = w2[3]; - t3[0] = w3[0]; - t3[1] = w3[1]; - t3[2] = pw_salt_len * 8; - t3[3] = 0; - - /** - * loop - */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = ix_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - overwrite_at_le_4x4 (t0, t1, t2, t3, w0lr, salt_len); - u32x w0_t[4]; u32x w1_t[4]; u32x w2_t[4]; u32x w3_t[4]; - w0_t[0] = t0[0]; - w0_t[1] = t0[1]; - w0_t[2] = t0[2]; - w0_t[3] = t0[3]; - w1_t[0] = t1[0]; - w1_t[1] = t1[1]; - w1_t[2] = t1[2]; - w1_t[3] = t1[3]; - w2_t[0] = t2[0]; - w2_t[1] = t2[1]; - w2_t[2] = t2[2]; - w2_t[3] = t2[3]; - w3_t[0] = t3[0]; - w3_t[1] = t3[1]; - w3_t[2] = t3[2]; - w3_t[3] = t3[3]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] = pw_salt_len * 8; + w3_t[3] = 0; /** * md5