From 4e97a4db246c3b5bd746effd7613858d2abbb081 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 20 Jul 2017 17:38:43 +0200 Subject: [PATCH 01/75] Add pure kernels for md5(sha1($pass)) --- OpenCL/m04400_a0.cl | 253 ++++++++++++++++++++++++++++++++++++++++++ OpenCL/m04400_a1.cl | 229 ++++++++++++++++++++++++++++++++++++++ OpenCL/m04400_a3.cl | 263 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 745 insertions(+) create mode 100644 OpenCL/m04400_a0.cl create mode 100644 OpenCL/m04400_a1.cl create mode 100644 OpenCL/m04400_a3.cl diff --git a/OpenCL/m04400_a0.cl b/OpenCL/m04400_a0.cl new file mode 100644 index 000000000..877fee359 --- /dev/null +++ b/OpenCL/m04400_a0.cl @@ -0,0 +1,253 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void 
m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} 
+ +__kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | 
uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04400_a1.cl b/OpenCL/m04400_a1.cl new file mode 100644 index 000000000..190138933 --- /dev/null +++ b/OpenCL/m04400_a1.cl @@ -0,0 +1,229 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + 
const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04400_a3.cl b/OpenCL/m04400_a3.cl new file mode 100644 index 000000000..598de7915 --- /dev/null +++ b/OpenCL/m04400_a3.cl @@ -0,0 +1,263 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, 
__global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = 
uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 
bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 + | uint_to_hex_lower8 ((a >> 16) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 8) & 255) << 0 + | uint_to_hex_lower8 ((a >> 0) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b 
>> 24) & 255) << 0 + | uint_to_hex_lower8 ((b >> 16) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 + | uint_to_hex_lower8 ((b >> 0) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 + | uint_to_hex_lower8 ((c >> 16) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 8) & 255) << 0 + | uint_to_hex_lower8 ((c >> 0) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 24) & 255) << 0 + | uint_to_hex_lower8 ((d >> 16) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 + | uint_to_hex_lower8 ((d >> 0) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 + | uint_to_hex_lower8 ((e >> 16) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8 ((e >> 8) & 255) << 0 + | uint_to_hex_lower8 ((e >> 0) & 255) << 16; + + ctx.len = 40; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 15d725e6cc320a09fe454de7d504b4bd3b7c400b Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 20 Jul 2017 17:50:07 +0200 Subject: [PATCH 02/75] Add pure kernels for sha1(sha1($pass)) --- OpenCL/m04500_a0.cl | 252 ++++++++++++++++++++++++++++++++++++++++++ OpenCL/m04500_a1.cl | 228 ++++++++++++++++++++++++++++++++++++++ OpenCL/m04500_a3.cl | 262 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 742 insertions(+) create mode 100644 OpenCL/m04500_a0.cl create mode 100644 OpenCL/m04500_a1.cl create mode 100644 OpenCL/m04500_a3.cl diff --git a/OpenCL/m04500_a0.cl b/OpenCL/m04500_a0.cl new file mode 100644 index 000000000..5f2d0d62e --- /dev/null +++ b/OpenCL/m04500_a0.cl @@ -0,0 +1,252 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include 
"inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const 
u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 
+ | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + 
ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04500_a1.cl b/OpenCL/m04500_a1.cl new file mode 100644 index 000000000..272495427 --- /dev/null +++ b/OpenCL/m04500_a1.cl @@ -0,0 +1,228 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04500_sxx (__global pw_t *pws, __global 
const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + 
+ sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04500_a3.cl b/OpenCL/m04500_a3.cl new file mode 100644 index 000000000..c14543397 --- /dev/null +++ b/OpenCL/m04500_a3.cl @@ -0,0 +1,262 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 
40; + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | 
uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + + ctx.len = 40; + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 1fdb9d1d7e04c77f9290609009c2a0a5b5c25f8a Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 20 Jul 2017 18:06:54 +0200 Subject: [PATCH 03/75] Add pure kernels for sha1($salt.sha1($pass)) --- OpenCL/m04520_a0.cl | 282 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m04520_a1.cl | 258 ++++++++++++++++++++++++++++++++++++++ OpenCL/m04520_a3.cl | 296 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 836 insertions(+) create mode 100644 OpenCL/m04520_a0.cl create mode 100644 OpenCL/m04520_a1.cl create mode 100644 OpenCL/m04520_a3.cl diff --git a/OpenCL/m04520_a0.cl b/OpenCL/m04520_a0.cl new file mode 100644 index 000000000..5ac22b8ac --- /dev/null +++ b/OpenCL/m04520_a0.cl @@ -0,0 +1,282 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], 
l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + 
w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | 
uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04520_a1.cl b/OpenCL/m04520_a1.cl new file mode 100644 index 000000000..797380832 --- /dev/null +++ b/OpenCL/m04520_a1.cl @@ -0,0 +1,258 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04520_mxx (__global 
pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + 
sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e 
>> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04520_a3.cl b/OpenCL/m04520_a3.cl new file mode 100644 index 000000000..dd633dda1 --- /dev/null +++ b/OpenCL/m04520_a3.cl @@ -0,0 +1,296 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void 
*tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + const u32x e = ctx1.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 
16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + const u32x e = ctx1.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 
255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 8f2cbb26dec52a0a33209fed3d146cb7fbbd1e33 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 20 Jul 2017 18:43:55 +0200 Subject: [PATCH 04/75] Update some salt lengths in interface.h --- include/interface.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/interface.h b/include/interface.h index 09c5199e6..0783c5e7c 100644 --- a/include/interface.h +++ b/include/interface.h @@ -1289,24 +1289,24 @@ typedef enum display_len DISPLAY_LEN_MIN_99999 = 1, DISPLAY_LEN_MAX_99999 = 55, - DISPLAY_LEN_MIN_11 = 32 + 1 + 16, - DISPLAY_LEN_MAX_11 = 32 + 1 + 32, - DISPLAY_LEN_MIN_12 = 32 + 1 + 1, + DISPLAY_LEN_MIN_11 = 32 + 1 + 0, + DISPLAY_LEN_MAX_11 = 32 + 1 + SALT_MAX, + DISPLAY_LEN_MIN_12 = 32 + 1 + 0, DISPLAY_LEN_MAX_12 = 32 + 1 + 32, - DISPLAY_LEN_MIN_21 = 32 + 1 + 1, - DISPLAY_LEN_MAX_21 = 32 + 1 + 15, + DISPLAY_LEN_MIN_21 = 32 + 1 + 2, + DISPLAY_LEN_MAX_21 = 32 + 1 + 2, DISPLAY_LEN_MIN_22 = 30 + 1 + 1, - DISPLAY_LEN_MAX_22 = 30 + 1 + 28, + DISPLAY_LEN_MAX_22 = 30 + 1 + 32, DISPLAY_LEN_MIN_23 = 32 + 1 + 0, DISPLAY_LEN_MAX_23 = 32 + 1 + SALT_MAX, DISPLAY_LEN_MIN_101 = 5 + 28, DISPLAY_LEN_MAX_101 = 5 + 28, - DISPLAY_LEN_MIN_111 = 6 + 28 + 0, - 
DISPLAY_LEN_MAX_111 = 6 + 28 + 40, + DISPLAY_LEN_MIN_111 = 6 + 28 + 1, + DISPLAY_LEN_MAX_111 = 6 + 28 + SALT_MAX, DISPLAY_LEN_MIN_112 = 40 + 1 + 20, DISPLAY_LEN_MAX_112 = 40 + 1 + 20, DISPLAY_LEN_MIN_121 = 40 + 1 + 1, - DISPLAY_LEN_MAX_121 = 40 + 1 + 32, + DISPLAY_LEN_MAX_121 = 40 + 1 + SALT_MAX, DISPLAY_LEN_MIN_122 = 8 + 40, DISPLAY_LEN_MAX_122 = 8 + 40, DISPLAY_LEN_MIN_124 = 4 + 1 + 0 + 1 + 40, @@ -1332,13 +1332,13 @@ typedef enum display_len DISPLAY_LEN_MIN_1731 = 128 + 6 + 0, DISPLAY_LEN_MAX_1731 = 128 + 6 + 16, DISPLAY_LEN_MIN_2611 = 32 + 1 + 0, - DISPLAY_LEN_MAX_2611 = 32 + 1 + 23, - DISPLAY_LEN_MIN_2612 = 6 + 0 + 1 + 32, - DISPLAY_LEN_MAX_2612 = 6 + 46 + 1 + 32, + DISPLAY_LEN_MAX_2611 = 32 + 1 + SALT_MAX, + DISPLAY_LEN_MIN_2612 = 6 + 0 + 1 + 32, + DISPLAY_LEN_MAX_2612 = 6 + SALT_MAX + 1 + 32, DISPLAY_LEN_MIN_2711 = 32 + 1 + 23, DISPLAY_LEN_MAX_2711 = 32 + 1 + 31, DISPLAY_LEN_MIN_2811 = 32 + 1 + 0, - DISPLAY_LEN_MAX_2811 = 32 + 1 + 31, + DISPLAY_LEN_MAX_2811 = 32 + 1 + SALT_MAX, DISPLAY_LEN_MIN_3711 = 3 + 0 + 1 + 32, DISPLAY_LEN_MAX_3711 = 3 + 31 + 1 + 32, DISPLAY_LEN_MIN_4521 = 40 + 1 + 32, From 3bc217ddb71d39d72dd9533fc9fafc1271f28a69 Mon Sep 17 00:00:00 2001 From: philsmd Date: Thu, 20 Jul 2017 23:40:11 +0200 Subject: [PATCH 05/75] tests: fix tests for -m 14000, 14100, 14900 and 15400 --- tools/test.sh | 377 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 297 insertions(+), 80 deletions(-) diff --git a/tools/test.sh b/tools/test.sh index 92cdc11fa..c753e30bc 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -237,30 +237,36 @@ function init() rm -rf ${OUTD}/${hash_type}_dict1 ${OUTD}/${hash_type}_dict2 touch ${OUTD}/${hash_type}_dict1 ${OUTD}/${hash_type}_dict2 - # foreach password entry split password in 2 (skip first entry, is len 1) - i=1 - # minimum password length - min_len=0 + min=1 # minimum line number from start of the file + min_offset=0 # minimum offset starting from ${min} lines if [ "${hash_type}" -eq 2500 ]; then - min_len=7 
# means length 8, since we start with 0 + min_offset=7 # means length 8, since we start with 0 elif [ "${hash_type}" -eq 14000 ]; then - min_len=7 + min=0 + min_offset=4 elif [ "${hash_type}" -eq 14100 ]; then - min_len=23 + min=0 + min_offset=3 elif [ "${hash_type}" -eq 14900 ]; then - min_len=9 + min=0 + min_offset=5 elif [ "${hash_type}" -eq 15400 ]; then - min_len=31 + min=0 + min_offset=3 elif [ "${hash_type}" -eq 15800 ]; then min_len=7 fi + # foreach password entry split password in 2 (skip first entry, is len 1) + + i=1 + while read -u 9 pass; do - if [ ${i} -gt 1 ]; then + if [ ${i} -gt ${min} ]; then # split password, 'i' is the len p0=$((i / 2)) @@ -272,8 +278,8 @@ function init() if [ "${pass_len}" -gt 1 ] then - p1=$((p1 + ${min_len})) - p0=$((p0 + ${min_len})) + p1=$((p1 + ${min_offset})) + p0=$((p0 + ${min_offset})) if [ "${p1}" -gt ${pass_len} ]; then @@ -597,11 +603,23 @@ function attack_1() e_nm=0 cnt=0 + min=1 + + if [ "${hash_type}" -eq 14000 ]; then + min=0 + elif [ "${hash_type}" -eq 14100 ]; then + min=0 + elif [ "${hash_type}" -eq 14900 ]; then + min=0 + elif [ "${hash_type}" -eq 15400 ]; then + min=0 + fi + echo "> Testing hash type $hash_type with attack mode 1, markov ${MARKOV}, single hash, Device-Type ${TYPE}, vector-width ${VECTOR}." 
&>> ${OUTD}/logfull.txt i=1 while read -u 9 hash; do - if [ $i -gt 1 ]; then + if [ $i -gt ${min} ]; then if [ "${file_only}" -eq 1 ]; then @@ -623,7 +641,11 @@ function attack_1() if [ "${ret}" -eq 0 ]; then - line_nr=$((i - 1)) + line_nr=1 + + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi line_dict1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) line_dict2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) @@ -671,6 +693,18 @@ function attack_1() # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 @@ -694,14 +728,6 @@ function attack_1() offset=7 elif [ ${hash_type} -eq 8500 ]; then offset=7 - elif [ ${hash_type} -eq 14000 ]; then - offset=7 - elif [ ${hash_type} -eq 14100 ]; then - offset=23 - elif [ ${hash_type} -eq 14900 ]; then - offset=9 - elif [ ${hash_type} -eq 15400 ]; then - offset=31 elif [ ${hash_type} -eq 15800 ]; then offset=7 fi @@ -743,7 +769,11 @@ function attack_1() while read -u 9 hash; do - line_nr=$((offset - i)) + line_nr=1 + + if [ "${offset}" -gt ${i} ]; then + line_nr=$((${offset} - ${i})) + fi line_dict1=$(tail -n ${line_nr} ${OUTD}/${hash_type}_dict1 | head -1) line_dict2=$(tail -n ${line_nr} ${OUTD}/${hash_type}_dict2 | head -1) @@ -818,22 +848,52 @@ function attack_3() mask_offset=7 max=7 elif [ "${hash_type}" -eq 14000 ]; then - mask_offset=7 - max=7 + mask_offset=4 + max=1 elif [ "${hash_type}" -eq 14100 ]; then - mask_offset=23 - max=23 + mask_offset=3 + max=1 elif [ "${hash_type}" -eq 14900 ]; then - mask_offset=9 - max=9 + mask_offset=5 + max=1 elif [ "${hash_type}" -eq 15400 ]; then - mask_offset=31 - max=31 + mask_offset=3 + max=1 elif [ "${hash_type}" -eq 15800 ]; then mask_offset=7 max=7 fi + # special case: we need 
to split the first line + + if [ "${mask_offset}" -ne 0 ]; then + + pass=$(sed -n 1p ${OUTD}/${hash_type}_passwords.txt) + + pass_part_2=$(echo -n ${pass} | cut -b $((${mask_offset} + 1))-) + + mask_custom="" + + if [ "${hash_type}" -eq 14000 ]; then + + mask_custom="${pass}" + + elif [ "${hash_type}" -eq 14100 ]; then + + mask_custom="${pass}" + + else + + for i in $(seq 1 ${mask_offset}); do + mask_custom="${mask_custom}?d" + done + + mask_custom="${mask_custom}${pass_part_2}" + + fi + + fi + i=1 while read -u 9 hash; do @@ -842,7 +902,7 @@ function attack_3() if ! contains ${hash_type} ${TIMEOUT_ALGOS}; then - break; + break fi @@ -857,12 +917,13 @@ function attack_3() fi mask=${mask_3[$((i + ${mask_offset}))]} + dict="${OUTD}/${hash_type}_passwords.txt" # modify "default" mask if needed (and set custom charset to reduce keyspace) if [ "${hash_type}" -eq 2500 ] || [ "${hash_type}" -eq 15800 ]; then - pass=$(sed -n ${i}p ${OUTD}/${hash_type}_passwords.txt) + pass=$(sed -n ${i}p ${dict}) mask=${pass} @@ -882,6 +943,10 @@ function attack_3() fi + if [ "${mask_offset}" -ne 0 ]; then + mask=${mask_custom} + fi + CMD="./${BIN} ${OPTS} -a 3 -m ${hash_type} '${hash}' ${mask}" echo -n "[ len $i ] " &>> ${OUTD}/logfull.txt @@ -894,7 +959,7 @@ function attack_3() if [ "${ret}" -eq 0 ]; then - line_dict=$(sed -n ${i}p ${OUTD}/${hash_type}_passwords.txt) + line_dict=$(sed -n ${i}p ${dict}) if [ ${pass_only} -eq 1 ]; then search=":${line_dict}" @@ -939,6 +1004,18 @@ function attack_3() # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 @@ -957,18 +1034,6 @@ function attack_3() if [ "${hash_type}" -eq 2500 ]; then increment_min=8 increment_max=9 - elif [ "${hash_type}" -eq 
14000 ]; then - increment_min=8 - increment_max=8 - elif [ "${hash_type}" -eq 14100 ]; then - increment_min=24 - increment_max=24 - elif [ "${hash_type}" -eq 14900 ]; then - increment_min=10 - increment_max=10 - elif [ "${hash_type}" -eq 15400 ]; then - increment_min=32 - increment_max=32 elif [ "${hash_type}" -eq 15800 ]; then increment_min=8 increment_max=9 @@ -1170,37 +1235,84 @@ function attack_6() echo "> Testing hash type $hash_type with attack mode 6, markov ${MARKOV}, single hash, Device-Type ${TYPE}, vector-width ${VECTOR}." &>> ${OUTD}/logfull.txt - i=1 - + min=1 max=8 + mask_offset=0 if [ "${hash_type}" -eq 2500 ]; then max=6 elif [ "${hash_type}" -eq 14000 ]; then - max=6 + min=0 + max=1 + mask_offset=4 elif [ "${hash_type}" -eq 14100 ]; then - max=6 + min=0 + max=1 + mask_offset=21 elif [ "${hash_type}" -eq 14900 ]; then - max=6 + min=0 + max=1 + mask_offset=5 elif [ "${hash_type}" -eq 15400 ]; then - max=6 + min=0 + max=1 + mask_offset=29 elif [ "${hash_type}" -eq 15800 ]; then max=6 fi + # special case: we need to split the first line + + if [ "${min}" -eq 0 ]; then + + pass_part_1=$(sed -n 1p ${OUTD}/${hash_type}_dict1) + pass_part_2=$(sed -n 1p ${OUTD}/${hash_type}_dict2) + + pass="${pass_part_1}${pass_part_2}" + + echo -n ${pass} | cut -b -$((${mask_offset} + 0)) > ${OUTD}/${hash_type}_dict1_custom + echo -n ${pass} | cut -b $((${mask_offset} + 1))- > ${OUTD}/${hash_type}_dict2_custom + + mask_custom="" + + for i in $(seq 1 $((${#pass} - ${mask_offset}))); do + + if [ "${hash_type}" -eq 14000 ]; then + + char=$(echo -n ${pass} | cut -b $((${i} + ${mask_offset}))) + mask_custom="${mask_custom}${char}" + + elif [ "${hash_type}" -eq 14100 ]; then + + char=$(echo -n ${pass} | cut -b $((${i} + ${mask_offset}))) + mask_custom="${mask_custom}${char}" + + else + + mask_custom="${mask_custom}?d" + + fi + + done + + fi + + + i=1 + while read -u 9 hash; do if [ "${i}" -gt 6 ]; then if ! 
contains ${hash_type} ${TIMEOUT_ALGOS}; then - break; + break fi fi - if [ $i -gt 1 ]; then + if [ ${i} -gt ${min} ]; then if [ "${file_only}" -eq 1 ]; then @@ -1210,11 +1322,23 @@ function attack_6() fi - CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} '${hash}' ${OUTD}/${hash_type}_dict1 ${mask_6[$i]}" + mask=${mask_6[${i}]} + + dict1=${OUTD}/${hash_type}_dict1 + dict2=${OUTD}/${hash_type}_dict2 + + if [ "${min}" -eq 0 ]; then + mask=${mask_custom} + + dict1=${OUTD}/${hash_type}_dict1_custom + dict2=${OUTD}/${hash_type}_dict2_custom + fi + + CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} '${hash}' ${dict1} ${mask}" echo -n "[ len $i ] " &>> ${OUTD}/logfull.txt - output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} "${hash}" ${OUTD}/${hash_type}_dict1 ${mask_6[$i]} 2>&1) + output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} "${hash}" ${dict1} ${mask} 2>&1) ret=${?} @@ -1222,10 +1346,14 @@ function attack_6() if [ "${ret}" -eq 0 ]; then - line_nr=$((i - 1)) + line_nr=1 - line_dict1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) - line_dict2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi + + line_dict1=$(sed -n ${line_nr}p ${dict1}) + line_dict2=$(sed -n ${line_nr}p ${dict2}) if [ ${pass_only} -eq 1 ]; then search=":${line_dict1}${line_dict2}" @@ -1267,11 +1395,26 @@ function attack_6() echo "[ ${OUTD} ] [ Type ${hash_type}, Attack 6, Mode single, Device-Type ${TYPE}, Vector-Width ${VECTOR} ] > $msg : ${e_nf}/${cnt} not found, ${e_nm}/${cnt} not matched, ${e_to}/${cnt} timeout" + rm -f ${OUTD}/${hash_type}_dict1_custom + rm -f ${OUTD}/${hash_type}_dict2_custom + fi # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 @@ 
-1287,14 +1430,6 @@ function attack_6() max=8 elif [ ${hash_type} -eq 8500 ]; then max=8 - elif [ ${hash_type} -eq 14000 ]; then - max=5 - elif [ ${hash_type} -eq 14100 ]; then - max=5 - elif [ ${hash_type} -eq 14900 ]; then - max=5 - elif [ ${hash_type} -eq 15400 ]; then - max=5 elif [ ${hash_type} -eq 15800 ]; then max=5 fi @@ -1332,11 +1467,13 @@ function attack_6() fi - CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask_6[$i]}" + mask=${mask_6[$i]} + + CMD="./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask}" echo "> Testing hash type $hash_type with attack mode 6, markov ${MARKOV}, multi hash with word len ${i}." &>> ${OUTD}/logfull.txt - output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask_6[$i]} 2>&1) + output=$(./${BIN} ${OPTS} -a 6 -m ${hash_type} ${hash_file} ${OUTD}/${hash_type}_dict1_multi_${i} ${mask} 2>&1) ret=${?} @@ -1414,27 +1551,74 @@ function attack_7() echo "> Testing hash type $hash_type with attack mode 7, markov ${MARKOV}, single hash, Device-Type ${TYPE}, vector-width ${VECTOR}." 
&>> ${OUTD}/logfull.txt + min=1 max=8 + mask_offset=0 + if [ "${hash_type}" -eq 2500 ]; then max=5 elif [ "${hash_type}" -eq 14000 ]; then - max=5 + mask_offset=4 + min=0 + max=1 elif [ "${hash_type}" -eq 14100 ]; then - max=5 + mask_offset=3 + min=0 + max=1 elif [ "${hash_type}" -eq 14900 ]; then - max=5 + mask_offset=5 + min=0 + max=1 elif [ "${hash_type}" -eq 15400 ]; then - max=5 + mask_offset=3 + min=0 + max=1 elif [ "${hash_type}" -eq 15800 ]; then max=5 fi + # special case: we need to split the first line + + if [ "${min}" -eq 0 ]; then + + pass_part_1=$(sed -n 1p ${OUTD}/${hash_type}_dict1) + pass_part_2=$(sed -n 1p ${OUTD}/${hash_type}_dict2) + + pass="${pass_part_1}${pass_part_2}" + + echo -n ${pass} | cut -b -$((${mask_offset} + 0)) > ${OUTD}/${hash_type}_dict1_custom + echo -n ${pass} | cut -b $((${mask_offset} + 1))- > ${OUTD}/${hash_type}_dict2_custom + + mask_custom="" + + for i in $(seq 1 ${mask_offset}); do + + if [ "${hash_type}" -eq 14000 ]; then + + char=$(echo -n ${pass} | cut -b ${i}) + mask_custom="${mask_custom}${char}" + + elif [ "${hash_type}" -eq 14100 ]; then + + char=$(echo -n ${pass} | cut -b ${i}) + mask_custom="${mask_custom}${char}" + + else + + mask_custom="${mask_custom}?d" + + fi + + done + + fi + i=1 while read -u 9 hash; do - if [ $i -gt 1 ]; then + if [ ${i} -gt ${min} ]; then if [ "${file_only}" -eq 1 ]; then @@ -1450,7 +1634,11 @@ function attack_7() if [ "${hash_type}" -eq 2500 ] || [ "${hash_type}" -eq 15800 ]; then - line_nr=$((i - 1)) + line_nr=1 + + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi pass_part_1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) pass_part_2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) @@ -1470,11 +1658,21 @@ function attack_7() fi - CMD="./${BIN} ${OPTS} -a 7 -m ${hash_type} '${hash}' ${mask} ${OUTD}/${hash_type}_dict2" + dict1=${OUTD}/${hash_type}_dict1 + dict2=${OUTD}/${hash_type}_dict2 + + if [ "${min}" -eq 0 ]; then + mask=${mask_custom} + + 
dict1=${OUTD}/${hash_type}_dict1_custom + dict2=${OUTD}/${hash_type}_dict2_custom + fi + + CMD="./${BIN} ${OPTS} -a 7 -m ${hash_type} '${hash}' ${mask} ${dict2}" echo -n "[ len $i ] " &>> ${OUTD}/logfull.txt - output=$(./${BIN} ${OPTS} -a 7 -m ${hash_type} "${hash}" ${mask} ${OUTD}/${hash_type}_dict2 2>&1) + output=$(./${BIN} ${OPTS} -a 7 -m ${hash_type} "${hash}" ${mask} ${dict2} 2>&1) ret=${?} @@ -1482,10 +1680,14 @@ function attack_7() if [ "${ret}" -eq 0 ]; then - line_nr=$((i - 1)) + line_nr=1 - line_dict1=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict1) - line_dict2=$(sed -n ${line_nr}p ${OUTD}/${hash_type}_dict2) + if [ "${i}" -gt 1 ]; then + line_nr=$((${i} - 1)) + fi + + line_dict1=$(sed -n ${line_nr}p ${dict1}) + line_dict2=$(sed -n ${line_nr}p ${dict2}) if [ ${pass_only} -eq 1 ]; then search=":${line_dict1}${line_dict2}" @@ -1527,11 +1729,26 @@ function attack_7() echo "[ ${OUTD} ] [ Type ${hash_type}, Attack 7, Mode single, Device-Type ${TYPE}, Vector-Width ${VECTOR} ] > $msg : ${e_nf}/${cnt} not found, ${e_nm}/${cnt} not matched, ${e_to}/${cnt} timeout" + rm -f ${OUTD}/${hash_type}_dict1_custom + rm -f ${OUTD}/${hash_type}_dict2_custom + fi # multihash if [ ${MODE} -ne 0 ]; then + # no multi hash checks for these modes (because we only have 1 hash for each of them) + + if [ "${hash_type}" -eq 14000 ]; then + return + elif [ "${hash_type}" -eq 14100 ]; then + return + elif [ "${hash_type}" -eq 14900 ]; then + return + elif [ "${hash_type}" -eq 15400 ]; then + return + fi + e_to=0 e_nf=0 e_nm=0 From 9515927cf7980d8eba152ae720bdc521ee298fae Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 21 Jul 2017 15:44:29 +0200 Subject: [PATCH 06/75] Add pure kernels for sha1(md5()) --- OpenCL/m04700_a0.cl | 245 ++++++++++++++++++++++++++++++++++++++++++ OpenCL/m04700_a1.cl | 221 ++++++++++++++++++++++++++++++++++++++ OpenCL/m04700_a3.cl | 253 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 719 insertions(+) create mode 100644 OpenCL/m04700_a0.cl create 
mode 100644 OpenCL/m04700_a1.cl create mode 100644 OpenCL/m04700_a3.cl diff --git a/OpenCL/m04700_a0.cl b/OpenCL/m04700_a0.cl new file mode 100644 index 000000000..bb2374eaf --- /dev/null +++ b/OpenCL/m04700_a0.cl @@ -0,0 +1,245 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 
*bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update (&ctx0, w, pw_len); + + md5_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | 
uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + + ctx.len = 32; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size 
(0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update (&ctx0, w, pw_len); + + md5_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le 
((d >> 0) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + + ctx.len = 32; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04700_a1.cl b/OpenCL/m04700_a1.cl new file mode 100644 index 000000000..9a1ab14a9 --- /dev/null +++ b/OpenCL/m04700_a1.cl @@ -0,0 +1,221 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global 
const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + + ctx.len = 32; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + + ctx.len = 32; + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04700_a3.cl 
b/OpenCL/m04700_a3.cl new file mode 100644 index 000000000..9b00733ed --- /dev/null +++ b/OpenCL/m04700_a3.cl @@ -0,0 +1,253 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx0; + + md5_init_vector (&ctx0); + + md5_update_vector (&ctx0, w, pw_len); + + md5_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 
0) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + + ctx.len = 32; + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + 
* bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx0; + + md5_init_vector (&ctx0); + + md5_update_vector (&ctx0, w, pw_len); + + md5_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le 
((c >> 16) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + + ctx.len = 32; + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From e9821a01bacc9198a4b1030968c262fdba67b5b4 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 21 Jul 2017 16:42:55 +0200 Subject: [PATCH 07/75] Add pure kernels for sha1($salt.$pass.$salt) --- OpenCL/m04900_a0.cl | 172 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m04900_a1.cl | 144 ++++++++++++++++++++++++++++++++++ OpenCL/m04900_a3.cl | 186 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 502 insertions(+) create mode 100644 OpenCL/m04900_a0.cl create mode 100644 OpenCL/m04900_a1.cl create mode 100644 OpenCL/m04900_a3.cl diff --git a/OpenCL/m04900_a0.cl b/OpenCL/m04900_a0.cl new file mode 100644 index 000000000..b323e6e6e --- /dev/null +++ b/OpenCL/m04900_a0.cl @@ -0,0 +1,172 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const 
u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + sha1_update 
(&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl new file mode 100644 index 000000000..a13057b0e --- /dev/null +++ b/OpenCL/m04900_a1.cl @@ -0,0 +1,144 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 
4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + 
digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update (&ctx0, s, salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04900_a3.cl b/OpenCL/m04900_a3.cl new file mode 100644 index 000000000..6b365a6f8 --- /dev/null +++ b/OpenCL/m04900_a3.cl @@ -0,0 +1,186 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } 
+} + +__kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + 
sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 27edc07c2f324bc67a8231105f449b510e9da341 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 21 Jul 2017 16:59:10 +0200 Subject: [PATCH 08/75] Add pure kernels for iSCSI CHAP authentication, MD5(CHAP) --- OpenCL/m04800_a0.cl | 168 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m04800_a1.cl | 140 ++++++++++++++++++++++++++++++++++ OpenCL/m04800_a3.cl | 182 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 490 insertions(+) create mode 100644 OpenCL/m04800_a0.cl create mode 100644 OpenCL/m04800_a1.cl create mode 100644 OpenCL/m04800_a3.cl diff --git a/OpenCL/m04800_a0.cl b/OpenCL/m04800_a0.cl new file mode 100644 index 000000000..02c9903fa --- /dev/null +++ b/OpenCL/m04800_a0.cl @@ -0,0 +1,168 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + 
+__kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + 
ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04800_a1.cl b/OpenCL/m04800_a1.cl new file mode 100644 index 000000000..d7e968d1c --- /dev/null +++ b/OpenCL/m04800_a1.cl @@ -0,0 +1,140 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * 
modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, 
const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32 s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m04800_a3.cl b/OpenCL/m04800_a3.cl new file mode 100644 index 000000000..e69628fbb --- /dev/null +++ b/OpenCL/m04800_a3.cl @@ -0,0 +1,182 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, 
__global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32x s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + 
const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; + + u32x s[16] = { 0 }; + + s[0] = salt_bufs[salt_pos].salt_buf[0]; + s[1] = 
salt_bufs[salt_pos].salt_buf[1]; + s[2] = salt_bufs[salt_pos].salt_buf[2]; + s[3] = salt_bufs[salt_pos].salt_buf[3]; + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + ctx0.w0[0] = salt_bufs[salt_pos].salt_buf[4]; + + ctx0.len = 1; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_update_vector (&ctx, s, salt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 5e34ec348e68a1efae45b32949162a7c2aebd713 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 22 Jul 2017 18:05:18 +0200 Subject: [PATCH 09/75] Optimize kernels for ROCm 1.6 - Remove inline keywords - Remove volatile keywords where it causes ROCm to slow down - Replace DES functions (looks like bitselect somehow is no longer mapped to BFI_INT) --- OpenCL/inc_common.cl | 118 ++--- OpenCL/inc_hash_md4.cl | 4 +- OpenCL/inc_hash_md5.cl | 4 +- OpenCL/inc_hash_ripemd160.cl | 4 +- OpenCL/inc_hash_sha1.cl | 4 +- OpenCL/inc_hash_sha224.cl | 4 +- OpenCL/inc_hash_sha256.cl | 4 +- OpenCL/inc_hash_sha384.cl | 4 +- OpenCL/inc_hash_sha512.cl | 4 +- OpenCL/inc_hash_whirlpool.cl | 4 +- OpenCL/inc_rp.cl | 106 ++-- OpenCL/inc_simd.cl | 6 +- OpenCL/inc_types.cl | 134 +++-- OpenCL/inc_vendor.cl | 6 +- OpenCL/m01500_a3.cl | 950 ++++++++++++++++++---------------- OpenCL/m02501.cl | 4 +- OpenCL/m03000_a3.cl | 905 ++++++++++++++++---------------- OpenCL/m14000_a3-optimized.cl | 897 ++++++++++++++++---------------- OpenCL/markov_be.cl | 2 +- OpenCL/markov_le.cl | 2 +- 20 files changed, 1622 insertions(+), 1544 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 
8d20c2040..f50e073f9 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -7,7 +7,7 @@ * pure scalar functions */ -inline int ffz (const u32 v) +int ffz (const u32 v) { #ifdef _unroll #pragma unroll @@ -22,7 +22,7 @@ inline int ffz (const u32 v) return -1; } -inline int hash_comp (const u32 d1[4], __global const u32 *d2) +int hash_comp (const u32 d1[4], __global const u32 *d2) { if (d1[3] > d2[DGST_R3]) return ( 1); if (d1[3] < d2[DGST_R3]) return (-1); @@ -36,7 +36,7 @@ inline int hash_comp (const u32 d1[4], __global const u32 *d2) return (0); } -inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) +int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) { for (u32 l = 0, r = digests_cnt; r; r >>= 1) { @@ -59,12 +59,12 @@ inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global const return (-1); } -inline u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) +u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) { return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f))); } -inline u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) +u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) { 
if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0); if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0); @@ -79,7 +79,7 @@ inline u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global return (1); } -inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) +void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) { const u32 idx = atomic_inc (d_result); @@ -100,7 +100,7 @@ inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, con plains_buf[idx].il_pos = il_pos; } -inline int count_char (const u32 *buf, const int elems, const u32 c) +int count_char (const u32 *buf, const int elems, const u32 c) { int r = 0; @@ -117,7 +117,7 @@ inline int count_char (const u32 *buf, const int elems, const u32 c) return r; } -inline float get_entropy (const u32 *buf, const int elems) +float get_entropy (const u32 *buf, const int elems) { const int length = elems * 4; @@ -144,7 +144,7 @@ inline float get_entropy (const u32 *buf, const int elems) * vector functions */ -inline void truncate_block_4x4_le (u32x w0[4], const u32 len) +void truncate_block_4x4_le (u32x w0[4], const u32 len) { switch (len) { @@ -254,7 +254,7 @@ inline void truncate_block_4x4_le (u32x w0[4], const u32 len) } } -inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) { switch (len) { @@ -1060,7 +1060,7 @@ inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[ } } -inline void truncate_block_4x4_be (u32x w0[4], const u32 len) +void truncate_block_4x4_be (u32x w0[4], const u32 
len) { switch (len) { @@ -1170,7 +1170,7 @@ inline void truncate_block_4x4_be (u32x w0[4], const u32 len) } } -inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) { switch (len) { @@ -1976,7 +1976,7 @@ inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[ } } -inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x3727); @@ -2001,7 +2001,7 @@ inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x1707); @@ -2026,7 +2026,7 @@ inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x7372); @@ -2051,7 +2051,7 @@ inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) +void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x4602); @@ -2072,7 +2072,7 @@ inline void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) #endif } -inline void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) +void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x6420); @@ -2093,7 +2093,7 @@ inline void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) #endif } -inline void append_0x80_1x4 (u32x 
w0[4], const u32 offset) +void append_0x80_1x4 (u32x w0[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2103,7 +2103,7 @@ inline void append_0x80_1x4 (u32x w0[4], const u32 offset) w0[3] |= (offset >= 12) ? tmp : 0; } -inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) +void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2117,7 +2117,7 @@ inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) +void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2135,7 +2135,7 @@ inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offse w2[3] |= (offset >= 44) ? tmp : 0; } -inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2157,7 +2157,7 @@ inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], con w3[3] |= (offset >= 60) ? 
tmp : 0; } -inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { switch (offset) { @@ -2675,7 +2675,7 @@ inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32 } } -inline void append_0x80_1x16 (u32x w[16], const u32 offset) +void append_0x80_1x16 (u32x w[16], const u32 offset) { switch (offset) { @@ -2937,7 +2937,7 @@ inline void append_0x80_1x16 (u32x w[16], const u32 offset) } } -inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -3798,7 +3798,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -inline void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) +void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; @@ -4600,7 +4600,7 @@ inline void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4] } } -inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -5255,7 +5255,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x 
c1[4], u32x c2[4], u32x c3[4], const u32 offset) +void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -6182,7 +6182,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] #endif } -inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -7795,7 +7795,7 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], #endif } -inline void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -10114,7 +10114,7 @@ inline void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], #endif } -inline void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) +void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -13489,7 +13489,7 @@ inline void 
switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w #endif } -inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) +void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) { #if defined cl_amd_media_ops switch (salt_len) @@ -13678,7 +13678,7 @@ inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) #endif } -inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) +void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) { // would be nice to have optimization based on amd_bytealign as with _le counterpart @@ -13775,7 +13775,7 @@ inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) } } -inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) { #if defined cl_amd_media_ops switch (salt_len) @@ -14140,7 +14140,7 @@ inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], #endif } -inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) { // would be nice to have optimization based on amd_bytealign as with _le counterpart @@ -14329,7 +14329,7 @@ inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], * vector functions as scalar (for outer loop usage) */ -inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 tmp = 0x01 << ((offset & 3) * 8); @@ -14343,7 +14343,7 @@ inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) w1[3] |= (offset >= 28) ? 
tmp : 0; } -inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) +void append_0x80_1x4_S (u32 w0[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14353,7 +14353,7 @@ inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) w0[3] |= (offset >= 12) ? tmp : 0; } -inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14367,7 +14367,7 @@ inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14385,7 +14385,7 @@ inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset w2[3] |= (offset >= 44) ? tmp : 0; } -inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14407,7 +14407,7 @@ inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const w3[3] |= (offset >= 60) ? 
tmp : 0; } -inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { switch (offset) { @@ -14925,7 +14925,7 @@ inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w } } -inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x3727); @@ -14950,7 +14950,7 @@ inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x1707); @@ -14975,7 +14975,7 @@ inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x7372); @@ -15000,7 +15000,7 @@ inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) +void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #ifdef IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); @@ -15021,7 +15021,7 @@ inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) +void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #ifdef IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); @@ -15042,7 +15042,7 @@ inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -inline void switch_buffer_by_offset_le_S (u32 
w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -15903,7 +15903,7 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } -inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; @@ -16705,7 +16705,7 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], } } -inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -17360,7 +17360,7 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } -inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -18287,7 +18287,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], #endif } -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined 
IS_GENERIC const int offset_mod_4 = offset & 3; @@ -19900,7 +19900,7 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -22219,7 +22219,7 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) +void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -25594,7 +25594,7 @@ inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2 #endif } -inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) +void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -36655,7 +36655,7 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif } -inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) +void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -45438,7 +45438,7 @@ inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) PACKSV4 (s6, v6, e); \ PACKSV4 (s7, v7, e); -inline 
void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) +void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45498,7 +45498,7 @@ inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) +void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45678,7 +45678,7 @@ inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4 #endif } -inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) +void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45736,7 +45736,7 @@ inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) #endif } -inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) +void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45794,7 +45794,7 @@ inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) #endif } -inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) +void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 diff --git a/OpenCL/inc_hash_md4.cl b/OpenCL/inc_hash_md4.cl index 668d0bbc9..a9383a5da 100644 --- a/OpenCL/inc_hash_md4.cl +++ b/OpenCL/inc_hash_md4.cl @@ -111,7 +111,7 @@ void md4_init (md4_ctx_t *ctx) void md4_update_64 (md4_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ 
-1234,7 +1234,7 @@ void md4_init_vector_from_scalar (md4_ctx_vector_t *ctx, md4_ctx_t *ctx0) void md4_update_vector_64 (md4_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_md5.cl b/OpenCL/inc_hash_md5.cl index 926bbb2c4..95e06cbef 100644 --- a/OpenCL/inc_hash_md5.cl +++ b/OpenCL/inc_hash_md5.cl @@ -145,7 +145,7 @@ void md5_init (md5_ctx_t *ctx) void md5_update_64 (md5_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1303,7 +1303,7 @@ void md5_init_vector_from_scalar (md5_ctx_vector_t *ctx, md5_ctx_t *ctx0) void md5_update_vector_64 (md5_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_ripemd160.cl b/OpenCL/inc_hash_ripemd160.cl index bf5d2ec42..709ad3eb2 100644 --- a/OpenCL/inc_hash_ripemd160.cl +++ b/OpenCL/inc_hash_ripemd160.cl @@ -245,7 +245,7 @@ void ripemd160_init (ripemd160_ctx_t *ctx) void ripemd160_update_64 (ripemd160_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1504,7 +1504,7 @@ void ripemd160_init_vector_from_scalar (ripemd160_ctx_vector_t *ctx, ripemd160_c void ripemd160_update_vector_64 (ripemd160_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git 
a/OpenCL/inc_hash_sha1.cl b/OpenCL/inc_hash_sha1.cl index 9713a02dd..47fe4691d 100644 --- a/OpenCL/inc_hash_sha1.cl +++ b/OpenCL/inc_hash_sha1.cl @@ -177,7 +177,7 @@ void sha1_init (sha1_ctx_t *ctx) void sha1_update_64 (sha1_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1368,7 +1368,7 @@ void sha1_init_vector_from_scalar (sha1_ctx_vector_t *ctx, sha1_ctx_t *ctx0) void sha1_update_vector_64 (sha1_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha224.cl b/OpenCL/inc_hash_sha224.cl index 4f35938a6..553397f6c 100644 --- a/OpenCL/inc_hash_sha224.cl +++ b/OpenCL/inc_hash_sha224.cl @@ -162,7 +162,7 @@ void sha224_init (sha224_ctx_t *ctx) void sha224_update_64 (sha224_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1321,7 +1321,7 @@ void sha224_init_vector_from_scalar (sha224_ctx_vector_t *ctx, sha224_ctx_t *ctx void sha224_update_vector_64 (sha224_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl index 75fd99acf..92b35b579 100644 --- a/OpenCL/inc_hash_sha256.cl +++ b/OpenCL/inc_hash_sha256.cl @@ -162,7 +162,7 @@ void sha256_init (sha256_ctx_t *ctx) void sha256_update_64 (sha256_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = 
ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1321,7 +1321,7 @@ void sha256_init_vector_from_scalar (sha256_ctx_vector_t *ctx, sha256_ctx_t *ctx void sha256_update_vector_64 (sha256_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl index 8302cd379..0800b253a 100644 --- a/OpenCL/inc_hash_sha384.cl +++ b/OpenCL/inc_hash_sha384.cl @@ -186,7 +186,7 @@ void sha384_init (sha384_ctx_t *ctx) void sha384_update_128 (sha384_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif @@ -2017,7 +2017,7 @@ void sha384_init_vector_from_scalar (sha384_ctx_vector_t *ctx, sha384_ctx_t *ctx void sha384_update_vector_128 (sha384_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl index 6c58834eb..61c6e143d 100644 --- a/OpenCL/inc_hash_sha512.cl +++ b/OpenCL/inc_hash_sha512.cl @@ -186,7 +186,7 @@ void sha512_init (sha512_ctx_t *ctx) void sha512_update_128 (sha512_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif @@ -2017,7 +2017,7 @@ void sha512_init_vector_from_scalar (sha512_ctx_vector_t *ctx, sha512_ctx_t *ctx void sha512_update_vector_128 (sha512_ctx_vector_t *ctx, 
u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif diff --git a/OpenCL/inc_hash_whirlpool.cl b/OpenCL/inc_hash_whirlpool.cl index a983cefb7..1ec270105 100644 --- a/OpenCL/inc_hash_whirlpool.cl +++ b/OpenCL/inc_hash_whirlpool.cl @@ -1345,7 +1345,7 @@ void whirlpool_init (whirlpool_ctx_t *ctx, __local u32 (*s_Ch)[256], __local u32 void whirlpool_update_64 (whirlpool_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -2608,7 +2608,7 @@ void whirlpool_init_vector_from_scalar (whirlpool_ctx_vector_t *ctx, whirlpool_c void whirlpool_update_vector_64 (whirlpool_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index 71c926d92..c50ec4a67 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -inline u32 generate_cmask (const u32 value) +u32 generate_cmask (const u32 value) { const u32 rmask = ((value & 0x40404040u) >> 1u) & ~((value & 0x80808080u) >> 2u); @@ -14,7 +14,7 @@ inline u32 generate_cmask (const u32 value) return rmask & ~hmask & lmask; } -inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) +void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) { const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u; @@ -67,7 +67,7 @@ inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) } } -inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) +void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) { const u32 
tmp = ~((1u << ((offset & 3u) * 8u)) - 1u); @@ -120,7 +120,7 @@ inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) } } -inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) +void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { out0[0] = amd_bytealign_S (in0[1], in0[0], 1); out0[1] = amd_bytealign_S (in0[2], in0[1], 1); @@ -132,7 +132,7 @@ inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 o out1[3] = amd_bytealign_S ( 0, in1[3], 1); } -inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) +void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { out1[3] = amd_bytealign_S (in1[3], in1[2], 3); out1[2] = amd_bytealign_S (in1[2], in1[1], 3); @@ -144,7 +144,7 @@ inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 o out0[0] = amd_bytealign_S (in0[0], 0, 3); } -inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) { switch (num) { @@ -439,7 +439,7 @@ inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } -inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) { switch (num) { @@ -734,7 +734,7 @@ inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } -inline void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) +void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) { // this version works with 1 byte append only @@ -754,7 +754,7 @@ inline void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 buf1[3] |= (offset >= 28) ? 
tmp : 0; } -inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) +void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -1012,7 +1012,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 #endif } -inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) +void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) { rshift_block_N (in0, in1, out0, out1, 32 - len); @@ -1038,7 +1038,7 @@ inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], con out1[3] = swap32_S (tib41[3]); } -inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -1052,7 +1052,7 @@ inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] &= ~(generate_cmask (buf0[0])); buf0[1] &= ~(generate_cmask (buf0[1])); @@ -1066,7 +1066,7 @@ inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len); @@ -1075,7 +1075,7 @@ inline u32 
rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { rule_op_mangle_urest (p0, p1, buf0, buf1, in_len); @@ -1084,7 +1084,7 @@ inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] ^= (generate_cmask (buf0[0])); buf0[1] ^= (generate_cmask (buf0[1])); @@ -1098,7 +1098,7 @@ inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1119,14 +1119,14 @@ inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { reverse_block (buf0, buf1, buf0, buf1, in_len); return in_len; } -inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1139,7 +1139,7 @@ inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 
buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (((in_len * p0) + in_len) >= 32) return (in_len); @@ -1167,7 +1167,7 @@ inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1185,7 +1185,7 @@ inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1198,7 +1198,7 @@ inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1213,7 +1213,7 @@ inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1237,7 +1237,7 @@ inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 
buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1267,7 +1267,7 @@ inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1278,7 +1278,7 @@ inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1298,7 +1298,7 @@ inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1373,7 +1373,7 @@ inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1388,7 +1388,7 @@ inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1474,7 +1474,7 @@ inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 
buf0[4], u32 buf return out_len; } -inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -1546,7 +1546,7 @@ inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1569,7 +1569,7 @@ inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1578,7 +1578,7 @@ inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], return p0; } -inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { const uchar4 tmp0 = (uchar4) (p0); const uchar4 tmp1 = (uchar4) (p1); @@ -1597,7 +1597,7 @@ inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { u32 out_len = 0; @@ -1638,13 +1638,13 @@ inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -inline u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const 
u32 in_len) +u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { // TODO return in_len; } -inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1831,7 +1831,7 @@ inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1865,7 +1865,7 @@ inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4] return out_len; } -inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + in_len) >= 32) return (in_len); @@ -1898,7 +1898,7 @@ inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1907,7 +1907,7 @@ inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 
in_len) { if (in_len < 2) return (in_len); @@ -1992,7 +1992,7 @@ inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); if (p1 >= in_len) return (in_len); @@ -2239,7 +2239,7 @@ inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2261,7 +2261,7 @@ inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2283,7 +2283,7 @@ inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2307,7 +2307,7 @@ inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2331,7 +2331,7 @@ inline u32 rule_op_mangle_chr_decr (const u32 p0, 
const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((p0 + 1) >= in_len) return (in_len); @@ -2358,7 +2358,7 @@ inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 == 0) return (in_len); @@ -2387,7 +2387,7 @@ inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2425,7 +2425,7 @@ inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[ return out_len; } -inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2454,7 +2454,7 @@ inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -2497,7 +2497,7 @@ inline u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 apply_rule (const 
u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { u32 out_len = in_len; @@ -2549,7 +2549,7 @@ inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -inline u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) +u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) { u32 out_len = len; @@ -2567,7 +2567,7 @@ inline u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], cons return out_len; } -inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, __global const kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4]) +u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, __global const kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4]) { #if VECT_SIZE == 1 diff --git a/OpenCL/inc_simd.cl b/OpenCL/inc_simd.cl index 37548b44c..ac9f0410f 100644 --- a/OpenCL/inc_simd.cl +++ b/OpenCL/inc_simd.cl @@ -1054,7 +1054,7 @@ // attack-mode 0 -inline u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) +u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) { #if VECT_SIZE == 1 const u32x ix = (u32x) (bfs_buf[il_pos + 0].i); @@ -1073,7 +1073,7 @@ inline u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) // attack-mode 1 -inline u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_pos) +u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_pos) { #if VECT_SIZE == 1 const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len); @@ -1090,7 +1090,7 @@ inline u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_po return pw_lenx; } -inline u32x ix_create_combt (__global const pw_t *combs_buf, const u32 il_pos, const int idx) +u32x 
ix_create_combt (__global const pw_t *combs_buf, const u32 il_pos, const int idx) { #if VECT_SIZE == 1 const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx]); diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 2bfb641df..78ecd9988 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -33,14 +33,14 @@ typedef VTYPE(uint, VECT_SIZE) u32x; typedef VTYPE(ulong, VECT_SIZE) u64x; #endif -inline u32 l32_from_64_S (u64 a) +u32 l32_from_64_S (u64 a) { const u32 r = (u32) (a); return r; } -inline u32 h32_from_64_S (u64 a) +u32 h32_from_64_S (u64 a) { a >>= 32; @@ -49,12 +49,12 @@ inline u32 h32_from_64_S (u64 a) return r; } -inline u64 hl32_to_64_S (const u32 a, const u32 b) +u64 hl32_to_64_S (const u32 a, const u32 b) { return as_ulong ((uint2) (b, a)); } -inline u32x l32_from_64 (u64x a) +u32x l32_from_64 (u64x a) { u32x r; @@ -93,7 +93,7 @@ inline u32x l32_from_64 (u64x a) return r; } -inline u32x h32_from_64 (u64x a) +u32x h32_from_64 (u64x a) { a >>= 32; @@ -134,7 +134,7 @@ inline u32x h32_from_64 (u64x a) return r; } -inline u64x hl32_to_64 (const u32x a, const u32x b) +u64x hl32_to_64 (const u32x a, const u32x b) { u64x r; @@ -174,45 +174,37 @@ inline u64x hl32_to_64 (const u32x a, const u32x b) } #ifdef IS_AMD -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { return rotate (a, 32 - n); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { - const u32 a0 = h32_from_64_S (a); - const u32 a1 = l32_from_64_S (a); - - const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n); - const u32 t1 = (n >= 32) ? 
amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n); - - const u64 r = hl32_to_64_S (t0, t1); - - return r; + return rotate (a, (u64) (64 - n)); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { - return rotr64_S (a, 64 - n); + return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -220,7 +212,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -232,82 +224,74 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { return rotate (a, 32 - n); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { - const u32x a0 = h32_from_64 (a); - const u32x a1 = l32_from_64 (a); - - const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n); - const u32x t1 = (n >= 32) ? 
amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n); - - const u64x r = hl32_to_64 (t0, t1); - - return r; + return rotate (a, (u64x) (64 - n)); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { - return rotr64 (a, 64 - n); + return rotate (a, (u64x) n); } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { return amd_bfe (a, b, c); } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { return amd_bfe (a, b, c); } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { return amd_bytealign (a, b, c); } #endif #ifdef IS_NV -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { return rotate (a, 32 - n); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { return rotr64_S (a, 64 - n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -315,7 +299,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -327,27 +311,27 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, 
const u32 n) { return rotate (a, 32 - n); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { return rotate (a, (u64) n); } -inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) +u32x __byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -386,7 +370,7 @@ inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) return r; } -inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) +u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -395,7 +379,7 @@ inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) return r; } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { u32x r; @@ -434,7 +418,7 @@ inline u32x __bfe (const u32x a, const u32x b, const u32x c) return r; } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -443,7 +427,7 @@ inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) return r; } -inline u32x amd_bytealign (const u32x a, const u32x b, const u32x c) +u32x amd_bytealign (const u32x a, const u32x b, const u32x c) { u32x r; @@ -490,7 +474,7 @@ inline u32x amd_bytealign (const u32x a, const u32x b, const u32x c) return r; } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -509,37 +493,37 @@ inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) #endif #ifdef IS_GENERIC -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong 
(as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { return rotate (a, 32 - n); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -547,7 +531,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -559,27 +543,27 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { return rotate (a, 32 - n); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { return rotate (a, (u64) n); } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { #define BIT(x) ((u32x) (1u) << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -592,7 +576,7 @@ inline u32x __bfe (const u32x a, const u32x b, const u32x c) #undef BFE } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { #define BIT(x) (1u << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -605,7 +589,7 @@ inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) #undef BFE } -inline u32x 
amd_bytealign (const u32x a, const u32x b, const u32 c) +u32x amd_bytealign (const u32x a, const u32x b, const u32 c) { #if VECT_SIZE == 1 const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8); @@ -638,7 +622,7 @@ inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) #endif } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8); diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index e990b0a31..a238286ea 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -153,9 +153,6 @@ #if KERN_TYPE == 13800 #undef _unroll #endif -#if KERN_TYPE == 14100 -#undef _unroll -#endif // nvidia specific @@ -177,6 +174,9 @@ #if KERN_TYPE == 14000 #undef _unroll #endif +#if KERN_TYPE == 14100 +#undef _unroll +#endif #endif #endif diff --git a/OpenCL/m01500_a3.cl b/OpenCL/m01500_a3.cl index f54007b40..e372251c4 100644 --- a/OpenCL/m01500_a3.cl +++ b/OpenCL/m01500_a3.cl @@ -14,7 +14,20 @@ #define COMPARE_S "inc_comp_single_bs.cl" #define COMPARE_M "inc_comp_multi_bs.cl" -#define myselx(a,b,c) ((c) ? (b) : (a)) +#ifdef IS_NV +#define KXX_DECL +#define sXXX_DECL +#endif + +#ifdef IS_AMD +#define KXX_DECL +#define sXXX_DECL +#endif + +#ifdef IS_GENERIC +#define KXX_DECL +#define sXXX_DECL +#endif #ifdef IS_NV @@ -888,11 +901,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. 
* - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). @@ -911,473 +924,561 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = 
a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 
= xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - 
vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, 
x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - 
vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 & ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 = x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + 
x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, 
x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; + x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ 
xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - 
vxor(x9D49D39C, x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, 
x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - 
vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = 
x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - 
vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C 
= x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, 
x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - 
vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ xC729695A; + *out1 ^= x01; } #endif +//#define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; } #define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; } #define DATASWAP \ @@ -1431,37 +1532,24 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #define KEYSET07 { k00 = K31; k01 = K35; k02 = K52; k03 = K43; k04 = K08; k05 = K37; k06 = K51; k07 = K15; k08 = K49; k09 = K30; k10 = K07; k11 = K02; k12 = K50; k13 = K21; k14 = K45; k15 = K44; k16 = K29; k17 = K16; k18 = K42; k19 = K23; k20 = K22; k21 = K14; k22 = K38; k23 = K01; k24 = K10; k25 = K47; k26 = K53; k27 = K11; k28 = K27; k29 = K26; k30 = K05; k31 = K17; k32 = K54; k33 = K41; k34 = K39; k35 = K20; k36 = K48; k37 = K13; k38 = K24; k39 = K19; k40 = K32; k41 = K40; k42 = K34; k43 = K03; k44 = K06; k45 = K18; k46 = K12; k47 = K46; } #define KEYSET17 { k00 = K15; k01 = K51; k02 = K36; k03 = K02; k04 = K49; k05 = K21; k06 = K35; k07 = K31; k08 = K08; k09 = K14; k10 = K23; k11 = K43; k12 = K09; k13 = K37; k14 = K29; k15 = K28; k16 = K45; k17 = K00; k18 = K01; k19 = K07; k20 = K38; k21 = K30; k22 = K22; k23 = K42; k24 = K26; k25 = K04; k26 = K41; k27 = K54; k28 = K39; k29 = K10; k30 = K48; k31 = K33; k32 = K11; k33 = K53; k34 = K27; k35 = K32; k36 = K05; k37 = K25; k38 = K40; k39 = K03; k40 = K20; k41 = K24; k42 = K46; k43 = K19; k44 = K18; k45 = K06; k46 = K55; k47 = K34; } -#ifdef IS_NV -#define KXX_DECL -#define sXXX_DECL -#endif - -#ifdef IS_AMD -#define KXX_DECL -#define sXXX_DECL -#endif - -#ifdef IS_GENERIC -#define KXX_DECL -#define sXXX_DECL -#endif +#define myselx(a,b,c) ((c) ? 
(b) : (a)) #ifdef DESCRYPT_SALT void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63) { - sXXX_DECL u32 s001 = (0x001 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s002 = (0x002 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s004 = (0x004 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s008 = (0x008 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s010 = (0x010 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s020 = (0x020 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s040 = (0x040 & DESCRYPT_SALT) ? 
0xffffffff : 0; - sXXX_DECL u32 s080 = (0x080 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s100 = (0x100 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s200 = (0x200 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s400 = (0x400 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s800 = (0x800 & DESCRYPT_SALT) ? 0xffffffff : 0; + sXXX_DECL u32 s001 = (0x001 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s002 = (0x002 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s004 = (0x004 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s008 = (0x008 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s010 = (0x010 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s020 = (0x020 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s040 = (0x040 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s080 = (0x080 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s100 = (0x100 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s200 = (0x200 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s400 = (0x400 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s800 = (0x800 & DESCRYPT_SALT) ? 
1 : 0; KXX_DECL u32 k00, k01, k02, k03, k04, k05; KXX_DECL u32 k06, k07, k08, k09, k10, k11; @@ -1474,60 +1562,6 @@ void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, cons for (u32 ii = 0; ii < 25; ii++) { - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(myselx (*D63, *D47, s001) ^ k00, myselx (*D32, *D48, s002) ^ k01, myselx (*D33, *D49, s004) ^ k02, myselx (*D34, *D50, s008) ^ k03, myselx (*D35, *D51, s010) ^ k04, myselx (*D36, *D52, s020) ^ k05, D08, D16, D22, D30); - s2(myselx (*D35, *D51, s040) ^ k06, myselx (*D36, *D52, s080) ^ k07, myselx (*D37, *D53, s100) ^ k08, myselx (*D38, *D54, s200) ^ k09, myselx (*D39, *D55, s400) ^ k10, myselx (*D40, *D56, s800) ^ k11, D12, D27, D01, D17); - s3( *D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4( *D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(myselx (*D47, *D63, s001) ^ k24, myselx (*D48, *D32, s002) ^ k25, myselx (*D49, *D33, s004) ^ k26, myselx (*D50, *D34, s008) ^ k27, myselx (*D51, *D35, s010) ^ k28, myselx (*D52, *D36, s020) ^ k29, D07, D13, D24, D02); - s6(myselx (*D51, *D35, s040) ^ k30, myselx (*D52, *D36, s080) ^ k31, myselx (*D53, *D37, s100) ^ k32, myselx (*D54, *D38, s200) ^ k33, myselx (*D55, *D39, s400) ^ k34, myselx (*D56, *D40, s800) ^ k35, D03, D28, D10, D18); - s7( *D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8( *D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: 
KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(myselx (*D31, *D15, s001) ^ k00, myselx (*D00, *D16, s002) ^ k01, myselx (*D01, *D17, s004) ^ k02, myselx (*D02, *D18, s008) ^ k03, myselx (*D03, *D19, s010) ^ k04, myselx (*D04, *D20, s020) ^ k05, D40, D48, D54, D62); - s2(myselx (*D03, *D19, s040) ^ k06, myselx (*D04, *D20, s080) ^ k07, myselx (*D05, *D21, s100) ^ k08, myselx (*D06, *D22, s200) ^ k09, myselx (*D07, *D23, s400) ^ k10, myselx (*D08, *D24, s800) ^ k11, D44, D59, D33, D49); - s3( *D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4( *D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32); - s5(myselx (*D15, *D31, s001) ^ k24, myselx (*D16, *D00, s002) ^ k25, myselx (*D17, *D01, s004) ^ k26, myselx (*D18, *D02, s008) ^ k27, myselx (*D19, *D03, s010) ^ k28, myselx (*D20, *D04, s020) ^ k29, D39, D45, D56, D34); - s6(myselx (*D19, *D03, s040) ^ k30, myselx (*D20, *D04, s080) ^ k31, myselx (*D21, *D05, s100) ^ k32, myselx (*D22, *D06, s200) ^ k33, myselx (*D23, *D07, s400) ^ k34, myselx (*D24, *D08, s800) ^ k35, D35, D60, D42, D50); - s7( *D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8( *D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1622,8 +1656,6 @@ void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, cons s8( *D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - #endif - DATASWAP; } diff --git a/OpenCL/m02501.cl b/OpenCL/m02501.cl index d506bc389..dbfd507ba 100644 --- a/OpenCL/m02501.cl +++ b/OpenCL/m02501.cl @@ -17,12 +17,12 @@ #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -inline u8 hex_convert (const u8 c) +u8 
hex_convert (const u8 c) { return (c & 15) + (c >> 6) * 9; } -inline u8 hex_to_u8 (const u8 hex[2]) +u8 hex_to_u8 (const u8 hex[2]) { u8 v = 0; diff --git a/OpenCL/m03000_a3.cl b/OpenCL/m03000_a3.cl index 9817c3828..26dea4196 100644 --- a/OpenCL/m03000_a3.cl +++ b/OpenCL/m03000_a3.cl @@ -19,7 +19,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL volatile +#define KXX_DECL #endif #ifdef IS_GENERIC @@ -898,11 +898,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
@@ -921,469 +921,556 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, 
x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, 
x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ 
a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = 
x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 
& ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 = x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - 
vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; 
+ x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, 
x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + 
x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ 
x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | 
x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = 
a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = 
x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, 
x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + 
x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ 
xC729695A; + *out1 ^= x01; } #endif @@ -1452,60 +1539,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 KXX_DECL u32 k36, k37, k38, k39, k40, k41; KXX_DECL u32 k42, k43, k44, k45, k46, k47; - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30); - s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17); - s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02); - s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18); - s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62); - s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49); - s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, 
D32); - s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34); - s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50); - s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1599,8 +1632,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - - #endif } void transpose32c (u32 data[32]) @@ -1694,7 +1725,7 @@ void transpose32c (u32 data[32]) swap (data[30], data[31], 1, 0x55555555); } -void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global 
const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * base @@ -2066,7 +2097,7 @@ void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __glo } } -void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset) +void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * base @@ -2481,7 +2512,7 @@ __kernel void m03000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) } } -__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const 
u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -2499,7 +2530,7 @@ __kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule m03000m (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, 
__global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git 
a/OpenCL/m14000_a3-optimized.cl b/OpenCL/m14000_a3-optimized.cl index 4a1b81cd3..2a33d1f60 100644 --- a/OpenCL/m14000_a3-optimized.cl +++ b/OpenCL/m14000_a3-optimized.cl @@ -19,7 +19,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL volatile +#define KXX_DECL #endif #ifdef IS_GENERIC @@ -898,11 +898,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). @@ -921,469 +921,556 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, 
x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); 
- vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, 
xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 
& x332200FF; + xAAAAAAAA = ~a1; + xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, 
xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 & ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 = x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = 
x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, 
xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; + x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, 
x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, 
x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = 
x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 
x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B 
= x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 
x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = 
x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, 
xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, 
x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ xC729695A; + *out1 ^= x01; } #endif @@ -1452,60 +1539,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 KXX_DECL u32 k36, k37, k38, k39, k40, k41; KXX_DECL u32 k42, k43, k44, k45, k46, k47; - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30); - s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17); - s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, 
D24, D02); - s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18); - s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62); - s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49); - s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32); - s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34); - s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50); - s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1599,8 +1632,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - - #endif } void transpose32c (u32 data[32]) diff --git a/OpenCL/markov_be.cl b/OpenCL/markov_be.cl index b178259ed..b62775c43 100644 --- a/OpenCL/markov_be.cl +++ b/OpenCL/markov_be.cl @@ -9,7 +9,7 @@ #include "inc_types.cl" -inline void generate_pw (u32 pw_buf[64], __global const cs_t 
*root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) +void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) { __global const cs_t *cs = &root_css_buf[pw_r_len]; diff --git a/OpenCL/markov_le.cl b/OpenCL/markov_le.cl index a90fc489c..2d7babff4 100644 --- a/OpenCL/markov_le.cl +++ b/OpenCL/markov_le.cl @@ -9,7 +9,7 @@ #include "inc_types.cl" -inline void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) +void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val) { __global const cs_t *cs = &root_css_buf[pw_r_len]; From b847bbb2743f9e117e1db0f0288c8e427a36cd9e Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 22 Jul 2017 18:46:12 +0200 Subject: [PATCH 10/75] Fix calculation of device_name_chksum; should be done for each iteration --- include/types.h | 3 ++- src/opencl.c | 50 +++++++++++++++++++++++++------------------------ 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/include/types.h b/include/types.h index 07d1095ee..36a83a073 100644 --- a/include/types.h +++ b/include/types.h @@ -1008,7 +1008,6 @@ typedef struct hc_device_param char *device_name; char *device_vendor; - char *device_name_chksum; char *device_version; char *driver_version; char *device_opencl_version; @@ -1142,6 +1141,8 @@ typedef struct opencl_ctx bool need_xnvctrl; bool need_sysfs; + int comptime; + int force_jit_compilation; } opencl_ctx_t; diff --git a/src/opencl.c b/src/opencl.c index 3fa5d0280..13648276a 100644 --- 
a/src/opencl.c +++ b/src/opencl.c @@ -3121,27 +3121,6 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->driver_version = driver_version; - // device_name_chksum - - char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); - - #if defined (__x86_64__) - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 64, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, comptime, user_options->opencl_vector_width, user_options->hash_mode); - #else - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 32, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, comptime, user_options->opencl_vector_width, user_options->hash_mode); - #endif - - u32 device_name_digest[4] = { 0 }; - - for (size_t i = 0; i < dnclen; i += 64) - { - md5_64 ((u32 *) (device_name_chksum + i), device_name_digest); - } - - snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%08x", device_name_digest[0]); - - device_param->device_name_chksum = device_name_chksum; - // vendor specific if (device_param->device_type & CL_DEVICE_TYPE_GPU) @@ -3436,6 +3415,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) opencl_ctx->need_xnvctrl = need_xnvctrl; opencl_ctx->need_sysfs = need_sysfs; + opencl_ctx->comptime = comptime; + return 0; } @@ -3459,7 +3440,6 @@ void opencl_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->skipped == true) continue; hcfree (device_param->device_name); - hcfree (device_param->device_name_chksum); hcfree (device_param->device_version); hcfree (device_param->driver_version); hcfree (device_param->device_opencl_version); @@ -3810,8 +3790,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) * device properties 
*/ - const char *device_name_chksum = device_param->device_name_chksum; - const u32 device_processors = device_param->device_processors; + const u32 device_processors = device_param->device_processors; /** * create context for each device @@ -4253,6 +4232,27 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: build_opts '%s'", device_id + 1, build_opts); #endif + /** + * device_name_chksum + */ + + char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); + + #if defined (__x86_64__) + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 64, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + #else + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 32, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + #endif + + u32 device_name_digest[4] = { 0 }; + + for (size_t i = 0; i < dnclen; i += 64) + { + md5_64 ((u32 *) (device_name_chksum + i), device_name_digest); + } + + snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%08x", device_name_digest[0]); + /** * main kernel */ @@ -4733,6 +4733,8 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) hcfree (kernel_sources[0]); } + hcfree (device_name_chksum); + // return back to the folder we came from initially (workaround) if (chdir (folder_config->cwd) == -1) From c255a967df2635612bd334074e656f3807c1fa42 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 22 Jul 2017 18:59:01 +0200 Subject: [PATCH 11/75] Fix some types in rotate functions --- OpenCL/inc_types.cl | 26 
+++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 78ecd9988..7737c7235 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -186,7 +186,7 @@ u64 swap64_S (const u64 v) u32 rotr32_S (const u32 a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } u32 rotl32_S (const u32 a, const u32 n) @@ -226,7 +226,7 @@ u64x swap64 (const u64x v) u32x rotr32 (const u32x a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } u32x rotl32 (const u32x a, const u32 n) @@ -273,7 +273,7 @@ u64 swap64_S (const u64 v) u32 rotr32_S (const u32 a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } u32 rotl32_S (const u32 a, const u32 n) @@ -283,12 +283,12 @@ u32 rotl32_S (const u32 a, const u32 n) u64 rotr64_S (const u64 a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64) (64 - n)); } u64 rotl64_S (const u64 a, const u32 n) { - return rotr64_S (a, 64 - n); + return rotate (a, (u64) n); } u32x swap32 (const u32x v) @@ -313,7 +313,7 @@ u64x swap64 (const u64x v) u32x rotr32 (const u32x a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } u32x rotl32 (const u32x a, const u32 n) @@ -323,12 +323,12 @@ u32x rotl32 (const u32x a, const u32 n) u64x rotr64 (const u64x a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64x) (64 - n)); } u64x rotl64 (const u64x a, const u32 n) { - return rotate (a, (u64) n); + return rotate (a, (u64x) n); } u32x __byte_perm (const u32x a, const u32x b, const u32x c) @@ -505,7 +505,7 @@ u64 swap64_S (const u64 v) u32 rotr32_S (const u32 a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } u32 rotl32_S (const u32 a, const u32 n) @@ -515,7 +515,7 @@ u32 rotl32_S (const u32 a, const u32 n) u64 rotr64_S (const u64 a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64) (64 - 
n)); } u64 rotl64_S (const u64 a, const u32 n) @@ -545,7 +545,7 @@ u64x swap64 (const u64x v) u32x rotr32 (const u32x a, const u32 n) { - return rotate (a, 32 - n); + return rotate (a, (32 - n)); } u32x rotl32 (const u32x a, const u32 n) @@ -555,12 +555,12 @@ u32x rotl32 (const u32x a, const u32 n) u64x rotr64 (const u64x a, const u32 n) { - return rotate (a, (u64) 64 - n); + return rotate (a, (u64x) (64 - n)); } u64x rotl64 (const u64x a, const u32 n) { - return rotate (a, (u64) n); + return rotate (a, (u64x) n); } u32x __bfe (const u32x a, const u32x b, const u32x c) From 4c71bc984eb473458cb982312ea75a6a4e578035 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 23 Jul 2017 13:01:54 +0200 Subject: [PATCH 12/75] Fix const keywords in -m 8600 --- OpenCL/m08600_a0-optimized.cl | 6 +++--- OpenCL/m08600_a1-optimized.cl | 6 +++--- OpenCL/m08600_a3-optimized.cl | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/OpenCL/m08600_a0-optimized.cl b/OpenCL/m08600_a0-optimized.cl index 8e5b7e9a2..d64bcc9ba 100644 --- a/OpenCL/m08600_a0-optimized.cl +++ b/OpenCL/m08600_a0-optimized.cl @@ -86,7 +86,7 @@ void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) } } -void lotus_transform_password (u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) +void lotus_transform_password (const u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) { u32x t = out[3] >> 24; @@ -183,7 +183,7 @@ void pad (u32 w[4], const u32 len) } } -void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform_norecalc (u32x state[4], const u32x block[4], __local u32 *s_lotus_magic_table) { u32x x[12]; @@ -208,7 +208,7 @@ void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_ma state[3] = x[3]; } -void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform (u32x state[4], u32x checksum[4], const u32x block[4], __local u32 *s_lotus_magic_table) { 
mdtransform_norecalc (state, block, s_lotus_magic_table); diff --git a/OpenCL/m08600_a1-optimized.cl b/OpenCL/m08600_a1-optimized.cl index e8b143ca2..28aaa4c45 100644 --- a/OpenCL/m08600_a1-optimized.cl +++ b/OpenCL/m08600_a1-optimized.cl @@ -84,7 +84,7 @@ void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) } } -void lotus_transform_password (u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) +void lotus_transform_password (const u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) { u32x t = out[3] >> 24; @@ -181,7 +181,7 @@ void pad (u32 w[4], const u32 len) } } -void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform_norecalc (u32x state[4], const u32x block[4], __local u32 *s_lotus_magic_table) { u32x x[12]; @@ -206,7 +206,7 @@ void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_ma state[3] = x[3]; } -void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform (u32x state[4], u32x checksum[4], const u32x block[4], __local u32 *s_lotus_magic_table) { mdtransform_norecalc (state, block, s_lotus_magic_table); diff --git a/OpenCL/m08600_a3-optimized.cl b/OpenCL/m08600_a3-optimized.cl index f2ddc9df5..1d85b39d7 100644 --- a/OpenCL/m08600_a3-optimized.cl +++ b/OpenCL/m08600_a3-optimized.cl @@ -83,7 +83,7 @@ void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) } } -void lotus_transform_password (u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) +void lotus_transform_password (const u32x in[4], u32x out[4], __local u32 *s_lotus_magic_table) { u32x t = out[3] >> 24; @@ -180,7 +180,7 @@ void pad (u32 w[4], const u32 len) } } -void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform_norecalc (u32x state[4], const u32x block[4], __local u32 *s_lotus_magic_table) { u32x x[12]; @@ -205,7 +205,7 @@ void mdtransform_norecalc (u32x state[4], u32x 
block[4], __local u32 *s_lotus_ma state[3] = x[3]; } -void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 *s_lotus_magic_table) +void mdtransform (u32x state[4], u32x checksum[4], const u32x block[4], __local u32 *s_lotus_magic_table) { mdtransform_norecalc (state, block, s_lotus_magic_table); From f6f22f6616fc0f52c293dafb60aeb7a816f0ff13 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 23 Jul 2017 13:55:25 +0200 Subject: [PATCH 13/75] Use a different cache hash for amp and mp kernel --- src/opencl.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/opencl.c b/src/opencl.c index 13648276a..b9f6f8265 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -402,15 +402,15 @@ void generate_source_kernel_mp_filename (const u32 opti_type, const u64 opts_typ } } -void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum, char *cached_file) +void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts_type, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file) { if ((opti_type & OPTI_TYPE_BRUTE_FORCE) && (opts_type & OPTS_TYPE_PT_GENERATE_BE)) { - snprintf (cached_file, 255, "%s/kernels/markov_be.%s.kernel", profile_dir, device_name_chksum); + snprintf (cached_file, 255, "%s/kernels/markov_be.%s.kernel", profile_dir, device_name_chksum_amp_mp); } else { - snprintf (cached_file, 255, "%s/kernels/markov_le.%s.kernel", profile_dir, device_name_chksum); + snprintf (cached_file, 255, "%s/kernels/markov_le.%s.kernel", profile_dir, device_name_chksum_amp_mp); } } @@ -419,9 +419,9 @@ void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_di snprintf (source_file, 255, "%s/OpenCL/amp_a%u.cl", shared_dir, attack_kern); } -void generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum, char *cached_file) +void 
generate_cached_kernel_amp_filename (const u32 attack_kern, char *profile_dir, const char *device_name_chksum_amp_mp, char *cached_file) { - snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", profile_dir, attack_kern, device_name_chksum); + snprintf (cached_file, 255, "%s/kernels/amp_a%u.%s.kernel", profile_dir, attack_kern, device_name_chksum_amp_mp); } int ocl_init (hashcat_ctx_t *hashcat_ctx) @@ -4236,12 +4236,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) * device_name_chksum */ - char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); + char *device_name_chksum = (char *) hcmalloc (HCBUFSIZ_TINY); + char *device_name_chksum_amp_mp = (char *) hcmalloc (HCBUFSIZ_TINY); #if defined (__x86_64__) - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 64, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 64, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY - 1, "%d-%u-%s-%s-%s-%d", 64, device_param->platform_vendor_id, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime); #else - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 32, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, 
user_options->hash_mode); + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%d-%u-%u-%s-%s-%s-%d-%u-%u", 32, device_param->platform_vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime, user_options->opencl_vector_width, user_options->hash_mode); + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY - 1, "%d-%u-%s-%s-%s-%d", 32, device_param->platform_vendor_id, device_param->device_name, device_param->device_version, device_param->driver_version, opencl_ctx->comptime); #endif u32 device_name_digest[4] = { 0 }; @@ -4253,6 +4256,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) snprintf (device_name_chksum, HCBUFSIZ_TINY - 1, "%08x", device_name_digest[0]); + u32 device_name_digest_amp_mp[4] = { 0 }; + + for (size_t i = 0; i < dnclen_amp_mp; i += 64) + { + md5_64 ((u32 *) (device_name_chksum_amp_mp + i), device_name_digest_amp_mp); + } + + snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY - 1, "%08x", device_name_digest_amp_mp[0]); + /** * main kernel */ @@ -4482,7 +4494,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char cached_file[256] = { 0 }; - generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum, cached_file); + generate_cached_kernel_mp_filename (hashconfig->opti_type, hashconfig->opts_type, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); bool cached = true; @@ -4623,7 +4635,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char cached_file[256] = { 0 }; - generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum, cached_file); + generate_cached_kernel_amp_filename (user_options_extra->attack_kern, folder_config->profile_dir, device_name_chksum_amp_mp, cached_file); bool cached = true; @@ -4734,6 +4746,7 @@ int opencl_session_begin 
(hashcat_ctx_t *hashcat_ctx) } hcfree (device_name_chksum); + hcfree (device_name_chksum_amp_mp); // return back to the folder we came from initially (workaround) From 4dca908cdf9dfe2925a834664194bccb3c3a8655 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 23 Jul 2017 14:06:32 +0200 Subject: [PATCH 14/75] Fix a typo in OpenCL/m01460_a3-optimized.cl --- OpenCL/m01460_a3-optimized.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenCL/m01460_a3-optimized.cl b/OpenCL/m01460_a3-optimized.cl index 156b75bcc..d20d313ff 100644 --- a/OpenCL/m01460_a3-optimized.cl +++ b/OpenCL/m01460_a3-optimized.cl @@ -221,7 +221,7 @@ void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[ sha256_transform (w0, w1, w2, w3, digest); } -void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global void *esal_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global 
const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier From 3125a756d9a08261d04595e6fad5263d85514b1d Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 23 Jul 2017 14:44:20 +0200 Subject: [PATCH 15/75] Remove some AMD _unroll restrictions no longer required with ROCm --- OpenCL/inc_vendor.cl | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index a238286ea..e461176c0 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -186,27 +186,6 @@ #ifdef IS_AMD #ifdef IS_GPU -#if KERN_TYPE == 1700 -#undef _unroll -#endif -#if KERN_TYPE == 1710 -#undef _unroll -#endif -#if KERN_TYPE == 5200 -#undef _unroll -#endif -#if KERN_TYPE == 10800 -#undef _unroll -#endif -#if KERN_TYPE == 10900 -#undef _unroll -#endif -#if KERN_TYPE == 12800 -#undef _unroll -#endif -#if KERN_TYPE == 12900 -#undef _unroll -#endif #endif #endif From 9562d072647e7282e99342ebe693f848cfe1062c Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 23 Jul 2017 17:01:15 +0200 Subject: [PATCH 16/75] Replace bitwise swaps with rotate() versions for AMD --- OpenCL/inc_types.cl | 26 ++++++++++++-------------- 1 file 
changed, 12 insertions(+), 14 deletions(-) diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 7737c7235..c31bc0046 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -176,12 +176,16 @@ u64x hl32_to_64 (const u32x a, const u32x b) #ifdef IS_AMD u32 swap32_S (const u32 v) { - return (as_uint (as_uchar4 (v).s3210)); + return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); } u64 swap64_S (const u64 v) { - return (as_ulong (as_uchar8 (v).s76543210)); + return bitselect (bitselect (rotate (v, 24ul), + rotate (v, 8ul), 0x000000ff000000fful), + bitselect (rotate (v, 56ul), + rotate (v, 40ul), 0x00ff000000ff0000ul), + 0xffff0000ffff0000ul); } u32 rotr32_S (const u32 a, const u32 n) @@ -206,22 +210,16 @@ u64 rotl64_S (const u64 a, const u32 n) u32x swap32 (const u32x v) { - return ((v >> 24) & 0x000000ff) - | ((v >> 8) & 0x0000ff00) - | ((v << 8) & 0x00ff0000) - | ((v << 24) & 0xff000000); + return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); } u64x swap64 (const u64x v) { - return ((v >> 56) & 0x00000000000000ff) - | ((v >> 40) & 0x000000000000ff00) - | ((v >> 24) & 0x0000000000ff0000) - | ((v >> 8) & 0x00000000ff000000) - | ((v << 8) & 0x000000ff00000000) - | ((v << 24) & 0x0000ff0000000000) - | ((v << 40) & 0x00ff000000000000) - | ((v << 56) & 0xff00000000000000); + return bitselect (bitselect (rotate (v, 24ul), + rotate (v, 8ul), 0x000000ff000000fful), + bitselect (rotate (v, 56ul), + rotate (v, 40ul), 0x00ff000000ff0000ul), + 0xffff0000ffff0000ul); } u32x rotr32 (const u32x a, const u32 n) From 772441448ad38feb0e4d78db1a40606b00efe5aa Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 24 Jul 2017 13:13:35 +0200 Subject: [PATCH 17/75] Optimized -m 8000 for ROCm --- OpenCL/inc_vendor.cl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index e461176c0..e6acd7d23 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -186,6 +186,9 @@ #ifdef IS_AMD #ifdef IS_GPU +#if KERN_TYPE 
== 8000 +#undef _unroll +#endif #endif #endif From 5bcda7d05a908d45e2d070b5ad9276a00f4c0f0d Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 24 Jul 2017 13:18:38 +0200 Subject: [PATCH 18/75] Optimized -m 5300 and -m 5400 for ROCm --- OpenCL/m05300_a3-optimized.cl | 126 ++++++++++------------------------ OpenCL/m05400_a3-optimized.cl | 126 ++++++++++------------------------ 2 files changed, 72 insertions(+), 180 deletions(-) diff --git a/OpenCL/m05300_a3-optimized.cl b/OpenCL/m05300_a3-optimized.cl index 52e48240d..9ea662939 100644 --- a/OpenCL/m05300_a3-optimized.cl +++ b/OpenCL/m05300_a3-optimized.cl @@ -195,7 +195,7 @@ void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], md5_transform (w0, w1, w2, w3, digest); } -void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global 
const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -254,20 +254,20 @@ void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 0]; + w0_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 1]; + w0_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 2]; + w0_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 3]; + w1_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 4]; + w1_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 5]; + w1_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 6]; + w1_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 7]; + w2_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 8]; + w2_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 9]; + w2_t[2] = ikepsk_bufs[digests_offset].nr_buf[10]; + w2_t[3] = 
ikepsk_bufs[digests_offset].nr_buf[11]; + w3_t[0] = ikepsk_bufs[digests_offset].nr_buf[12]; + w3_t[1] = ikepsk_bufs[digests_offset].nr_buf[13]; w3_t[2] = (64 + nr_len) * 8; w3_t[3] = 0; @@ -342,7 +342,7 @@ void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons } } -void m05300s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t 
*digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -413,20 +413,20 @@ void m05300s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 0]; + w0_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 1]; + w0_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 2]; + w0_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 3]; + w1_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 4]; + w1_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 5]; + w1_t[2] = ikepsk_bufs[digests_offset].nr_buf[ 6]; + w1_t[3] = ikepsk_bufs[digests_offset].nr_buf[ 7]; + w2_t[0] = ikepsk_bufs[digests_offset].nr_buf[ 8]; + w2_t[1] = ikepsk_bufs[digests_offset].nr_buf[ 9]; + w2_t[2] = ikepsk_bufs[digests_offset].nr_buf[10]; + w2_t[3] = ikepsk_bufs[digests_offset].nr_buf[11]; + w3_t[0] = ikepsk_bufs[digests_offset].nr_buf[12]; + w3_t[1] = ikepsk_bufs[digests_offset].nr_buf[13]; w3_t[2] = (64 + nr_len) * 8; w3_t[3] = 0; @@ -515,15 +515,6 @@ __kernel void m05300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 
s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -573,7 +564,7 @@ __kernel void m05300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 
bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -590,15 +581,6 @@ __kernel void m05300_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -648,7 +630,7 @@ __kernel void m05300_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global 
const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -665,15 +647,6 @@ __kernel void m05300_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -723,7 +696,7 @@ __kernel void m05300_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, 
d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -740,15 +713,6 @@ __kernel void m05300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -798,7 +762,7 @@ __kernel void m05300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, 
d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -815,15 +779,6 @@ __kernel void m05300_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 
128; i += lsz) @@ -873,7 +828,7 @@ __kernel void m05300_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05300_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const 
u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -890,15 +845,6 @@ __kernel void m05300_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = ikepsk_bufs[digests_offset].nr_buf[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -948,5 +894,5 @@ __kernel void m05300_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05300s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } diff --git a/OpenCL/m05400_a3-optimized.cl b/OpenCL/m05400_a3-optimized.cl index 4ad39067c..80097b8f0 100644 --- a/OpenCL/m05400_a3-optimized.cl +++ b/OpenCL/m05400_a3-optimized.cl @@ -229,7 +229,7 @@ void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5] sha1_transform (w0, w1, w2, w3, digest); } -void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], 
const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const 
u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -288,20 +288,20 @@ void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; - w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 0]); + w0_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 1]); + w0_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 2]); + w0_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 3]); + w1_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 4]); + w1_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 5]); + w1_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 6]); + w1_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 7]); + w2_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 8]); + w2_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 9]); + w2_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[10]); + w2_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[11]); + w3_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[12]); + w3_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[13]); w3_t[2] = 0; w3_t[3] = (64 + nr_len) * 8; @@ -376,7 +376,7 @@ void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons } } -void m05400s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 
*bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) +void m05400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *s_msg_buf) { /** * modifier @@ -447,20 +447,20 @@ void m05400s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], cons hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); - w0_t[0] = w_s[ 0]; - w0_t[1] = w_s[ 1]; 
- w0_t[2] = w_s[ 2]; - w0_t[3] = w_s[ 3]; - w1_t[0] = w_s[ 4]; - w1_t[1] = w_s[ 5]; - w1_t[2] = w_s[ 6]; - w1_t[3] = w_s[ 7]; - w2_t[0] = w_s[ 8]; - w2_t[1] = w_s[ 9]; - w2_t[2] = w_s[10]; - w2_t[3] = w_s[11]; - w3_t[0] = w_s[12]; - w3_t[1] = w_s[13]; + w0_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 0]); + w0_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 1]); + w0_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 2]); + w0_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 3]); + w1_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 4]); + w1_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 5]); + w1_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 6]); + w1_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 7]); + w2_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 8]); + w2_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[ 9]); + w2_t[2] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[10]); + w2_t[3] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[11]); + w3_t[0] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[12]); + w3_t[1] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[13]); w3_t[2] = 0; w3_t[3] = (64 + nr_len) * 8; @@ -549,15 +549,6 @@ __kernel void m05400_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -607,7 +598,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, 
d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -624,15 +615,6 @@ __kernel void m05400_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i 
= lid; i < 128; i += lsz) @@ -682,7 +664,7 @@ __kernel void m05400_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 
bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -699,15 +681,6 @@ __kernel void m05400_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -757,7 +730,7 @@ __kernel void m05400_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400m (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 
*bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -774,15 +747,6 @@ __kernel void m05400_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -832,7 +796,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, 
d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -849,15 +813,6 @@ __kernel void m05400_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -907,7 +862,7 @@ __kernel void m05400_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, 
d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } __kernel void m05400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -924,15 +879,6 @@ __kernel void m05400_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * s_msg */ - __local u32 w_s[16]; - - for (u32 i = lid; i < 16; i += lsz) - { - w_s[i] = swap32_S (ikepsk_bufs[digests_offset].nr_buf[i]); - } - - barrier (CLK_LOCAL_MEM_FENCE); - __local u32 
s_msg_buf[128]; for (u32 i = lid; i < 128; i += lsz) @@ -982,5 +928,5 @@ __kernel void m05400_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05400s (w_s, w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); + m05400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, ikepsk_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, s_msg_buf); } From 02e2279d594c57cb857201a825e06624f0ad3a5b Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 24 Jul 2017 14:33:34 +0200 Subject: [PATCH 19/75] Optimized -m 8500 for ROCm --- OpenCL/m08500_a0-optimized.cl | 36 +++++++++--------------------- OpenCL/m08500_a1-optimized.cl | 38 +++++++++---------------------- OpenCL/m08500_a3-optimized.cl | 42 +++++++++++++---------------------- 3 files changed, 37 insertions(+), 79 deletions(-) diff --git a/OpenCL/m08500_a0-optimized.cl b/OpenCL/m08500_a0-optimized.cl index ba9bc66fd..5626b51ce 100644 --- a/OpenCL/m08500_a0-optimized.cl +++ b/OpenCL/m08500_a0-optimized.cl @@ -508,17 +508,17 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_racf_key (const u32x w0, const u32x w1, u32x key[2], __local u32 *s_ascii_to_ebcdic_pc) +void 
transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = BOX1 (((w0 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w0 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w0 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w0 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; - key[1] = BOX1 (((w1 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w1 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w1 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w1 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -535,7 +535,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -560,13 +559,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - if (gid >= gid_max) return; /** @@ -615,7 +607,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; @@ -662,7 +654,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -687,11 +678,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -754,7 +740,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; diff --git a/OpenCL/m08500_a1-optimized.cl b/OpenCL/m08500_a1-optimized.cl index f76728dd8..6b7dca17e 100644 --- a/OpenCL/m08500_a1-optimized.cl +++ b/OpenCL/m08500_a1-optimized.cl @@ -506,17 +506,17 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_racf_key (const u32x w0, const u32x w1, u32x key[2], __local u32 
*s_ascii_to_ebcdic_pc) +void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = BOX1 (((w0 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w0 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w0 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w0 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; - key[1] = BOX1 (((w1 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w1 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w1 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w1 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, 
const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -533,7 +533,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -558,13 +557,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - if (gid >= gid_max) return; /** @@ -656,7 +648,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; @@ -703,7 +695,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -728,13 +719,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - if (gid >= gid_max) return; /** @@ -838,7 +822,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule u32x key[2]; - transform_racf_key (w0[0], w0[1], key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0[0], w0[1], key); const u32x c = key[0]; const u32x d = key[1]; diff --git a/OpenCL/m08500_a3-optimized.cl b/OpenCL/m08500_a3-optimized.cl index 3bdeae561..8cff7bcc2 100644 --- a/OpenCL/m08500_a3-optimized.cl +++ b/OpenCL/m08500_a3-optimized.cl @@ -506,20 +506,20 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_racf_key (const u32x w0, const u32x 
w1, u32x key[2], __local u32 *s_ascii_to_ebcdic_pc) +void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = BOX1 (((w0 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w0 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w0 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w0 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; - key[1] = BOX1 (((w1 >> 0) & 0xff), s_ascii_to_ebcdic_pc) << 0 - | BOX1 (((w1 >> 8) & 0xff), s_ascii_to_ebcdic_pc) << 8 - | BOX1 (((w1 >> 16) & 0xff), s_ascii_to_ebcdic_pc) << 16 - | BOX1 (((w1 >> 24) & 0xff), s_ascii_to_ebcdic_pc) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), c_ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), c_ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), c_ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } -void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u32 *s_ascii_to_ebcdic_pc, u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -557,7 +557,7 @@ void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u3 u32x key[2]; - transform_racf_key (w0, w1, key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0, w1, key); const u32x c = key[0]; const u32x d = key[1]; @@ -582,7 +582,7 @@ void m08500m (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u3 } } -void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], __local u32 *s_ascii_to_ebcdic_pc, u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global 
const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -632,7 +632,7 @@ void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], 
__local u3 u32x key[2]; - transform_racf_key (w0, w1, key, s_ascii_to_ebcdic_pc); + transform_racf_key (w0, w1, key); const u32x c = key[0]; const u32x d = key[1]; @@ -671,7 +671,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -696,11 +695,6 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -734,7 +728,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m08500m (s_SPtrans, s_skb, s_ascii_to_ebcdic_pc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m08500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } __kernel void m08500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -759,7 +753,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ - __local u32 s_ascii_to_ebcdic_pc[256]; __local u32 s_SPtrans[8][64]; __local u32 s_skb[8][64]; @@ -784,11 +777,6 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } - for (u32 i = lid; i < 256; i += lsz) - { - s_ascii_to_ebcdic_pc[i] = c_ascii_to_ebcdic_pc[i]; - } - barrier (CLK_LOCAL_MEM_FENCE); if (gid >= gid_max) return; @@ -822,7 +810,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m08500s (s_SPtrans, s_skb, s_ascii_to_ebcdic_pc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m08500s (s_SPtrans, s_skb, w, pw_len, 
pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } __kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) From a85be1d0f0d6e44760a08c68d2ec8c0591ed6c4d Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 24 Jul 2017 14:46:58 +0200 Subject: [PATCH 20/75] Fix some const keywords in inc_truecrypt_xts.cl --- OpenCL/inc_truecrypt_xts.cl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/OpenCL/inc_truecrypt_xts.cl b/OpenCL/inc_truecrypt_xts.cl index eab39e80e..6bf6684d2 100644 --- a/OpenCL/inc_truecrypt_xts.cl +++ b/OpenCL/inc_truecrypt_xts.cl @@ -150,7 +150,7 @@ void 
twofish256_decrypt_xts_next (const u32 *in, u32 *out, u32 *T, u32 *sk, u32 // 512 bit -int verify_header_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_aes (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_aes[60]; @@ -206,7 +206,7 @@ int verify_header_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *u return 1; } -int verify_header_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) +int verify_header_serpent (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) { u32 ks_serpent[140]; @@ -262,7 +262,7 @@ int verify_header_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u3 return 1; } -int verify_header_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) +int verify_header_twofish (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2) { u32 sk_twofish[4]; u32 lk_twofish[40]; @@ -321,7 +321,7 @@ int verify_header_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, const u3 // 1024 bit -int verify_header_aes_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_aes_twofish (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 
*s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_aes[60]; @@ -384,7 +384,7 @@ int verify_header_aes_twofish (__global tc_t *esalt_bufs, const u32 *ukey1, cons return 1; } -int verify_header_serpent_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_serpent_aes (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_serpent[140]; u32 ks_aes[60]; @@ -445,7 +445,7 @@ int verify_header_serpent_aes (__global tc_t *esalt_bufs, const u32 *ukey1, cons return 1; } -int verify_header_twofish_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4) +int verify_header_twofish_serpent (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4) { u32 sk_twofish[4]; u32 lk_twofish[40]; @@ -510,7 +510,7 @@ int verify_header_twofish_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, // 1536 bit -int verify_header_aes_twofish_serpent (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_aes_twofish_serpent 
(__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_aes[60]; @@ -579,7 +579,7 @@ int verify_header_aes_twofish_serpent (__global tc_t *esalt_bufs, const u32 *uke return 1; } -int verify_header_serpent_twofish_aes (__global tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) +int verify_header_serpent_twofish_aes (__global const tc_t *esalt_bufs, const u32 *ukey1, const u32 *ukey2, const u32 *ukey3, const u32 *ukey4, const u32 *ukey5, const u32 *ukey6, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) { u32 ks_serpent[140]; From 33804110d135f1b03caccf58d80ff389f9697b3a Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 28 Jul 2017 00:38:17 +0200 Subject: [PATCH 21/75] Update default environment variables --- src/shared.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/src/shared.c b/src/shared.c index 73d17646d..83442586d 100644 --- a/src/shared.c +++ b/src/shared.c @@ -335,32 +335,14 @@ void setup_environment_variables () } if (getenv ("GPU_FORCE_64BIT_PTR") == NULL) - putenv ((char *) "GPU_FORCE_64BIT_PTR=1"); - - if (getenv ("GPU_MAX_ALLOC_PERCENT") == NULL) - putenv ((char *) "GPU_MAX_ALLOC_PERCENT=100"); - - if (getenv ("GPU_SINGLE_ALLOC_PERCENT") == NULL) - putenv ((char *) 
"GPU_SINGLE_ALLOC_PERCENT=100"); - - if (getenv ("GPU_MAX_HEAP_SIZE") == NULL) - putenv ((char *) "GPU_MAX_HEAP_SIZE=100"); - - if (getenv ("CPU_FORCE_64BIT_PTR") == NULL) - putenv ((char *) "CPU_FORCE_64BIT_PTR=1"); - - if (getenv ("CPU_MAX_ALLOC_PERCENT") == NULL) - putenv ((char *) "CPU_MAX_ALLOC_PERCENT=100"); - - if (getenv ("CPU_SINGLE_ALLOC_PERCENT") == NULL) - putenv ((char *) "CPU_SINGLE_ALLOC_PERCENT=100"); - - if (getenv ("CPU_MAX_HEAP_SIZE") == NULL) - putenv ((char *) "CPU_MAX_HEAP_SIZE=100"); + putenv ((char *) "GPU_FORCE_64BIT_PTR=0"); if (getenv ("GPU_USE_SYNC_OBJECTS") == NULL) putenv ((char *) "GPU_USE_SYNC_OBJECTS=1"); + if (getenv ("OCL_CODE_CACHE_ENABLE") == NULL) + putenv ((char *) "OCL_CODE_CACHE_ENABLE=0"); + if (getenv ("CUDA_CACHE_DISABLE") == NULL) putenv ((char *) "CUDA_CACHE_DISABLE=1"); From a0b30dc9a37cc475bca2d4a892f783bf5da4b349 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 28 Jul 2017 01:33:29 +0200 Subject: [PATCH 22/75] Forcing OpenCL 1.2 no longer needed; all OpenCL runtimes updated --- src/opencl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/opencl.c b/src/opencl.c index b9f6f8265..a37be66ae 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -4213,9 +4213,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char build_opts_new[1024] = { 0 }; #if defined (DEBUG) - snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -cl-std=CL1.2", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); + snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u 
-D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); #else - snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -cl-std=CL1.2 -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); + snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); #endif if (device_param->device_type & CL_DEVICE_TYPE_CPU) From 332396a0036ef1e3407462acb815e8a21fc3dc52 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 28 Jul 2017 02:28:52 +0200 Subject: [PATCH 23/75] Fix SCRYPT on ROCm --- OpenCL/m08900.cl | 26 ++++++++++++-------------- OpenCL/m15700.cl | 26 ++++++++++++-------------- src/shared.c | 6 ------ 3 files changed, 24 insertions(+), 34 deletions(-) diff --git a/OpenCL/m08900.cl b/OpenCL/m08900.cl index 
402c2e0c0..9d65e1a71 100644 --- a/OpenCL/m08900.cl +++ b/OpenCL/m08900.cl @@ -138,6 +138,16 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 xd4 = x / 4; const u32 xm4 = x & 3; + __global uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + #ifdef _unroll #pragma unroll #endif @@ -156,13 +166,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ for (u32 y = 0; y < ySIZE; y++) { - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) V0[CO] = X[z]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) V1[CO] = X[z]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) V2[CO] = X[z]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) V3[CO] = X[z]; break; - } + for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } @@ -175,13 +179,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 km = k - (y * SCRYPT_TMTO); - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) T[z] = V0[CO]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) T[z] = V1[CO]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) T[z] = V2[CO]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) T[z] = V3[CO]; break; - } + for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO]; for (u32 i = 0; i < km; i++) salsa_r (T); diff --git a/OpenCL/m15700.cl b/OpenCL/m15700.cl index 57a33dc1b..a1971093f 100644 --- a/OpenCL/m15700.cl +++ b/OpenCL/m15700.cl @@ -138,6 +138,16 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 xd4 = x / 4; const u32 xm4 = x & 3; + __global uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + #ifdef _unroll #pragma unroll #endif @@ -156,13 +166,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ for 
(u32 y = 0; y < ySIZE; y++) { - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) V0[CO] = X[z]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) V1[CO] = X[z]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) V2[CO] = X[z]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) V3[CO] = X[z]; break; - } + for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } @@ -175,13 +179,7 @@ void scrypt_smix (uint4 *X, uint4 *T, __global uint4 *V0, __global uint4 *V1, __ const u32 km = k - (y * SCRYPT_TMTO); - switch (xm4) - { - case 0: for (u32 z = 0; z < zSIZE; z++) T[z] = V0[CO]; break; - case 1: for (u32 z = 0; z < zSIZE; z++) T[z] = V1[CO]; break; - case 2: for (u32 z = 0; z < zSIZE; z++) T[z] = V2[CO]; break; - case 3: for (u32 z = 0; z < zSIZE; z++) T[z] = V3[CO]; break; - } + for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO]; for (u32 i = 0; i < km; i++) salsa_r (T); diff --git a/src/shared.c b/src/shared.c index 83442586d..9ea3d575b 100644 --- a/src/shared.c +++ b/src/shared.c @@ -334,12 +334,6 @@ void setup_environment_variables () putenv ((char *) "DISPLAY=:0"); } - if (getenv ("GPU_FORCE_64BIT_PTR") == NULL) - putenv ((char *) "GPU_FORCE_64BIT_PTR=0"); - - if (getenv ("GPU_USE_SYNC_OBJECTS") == NULL) - putenv ((char *) "GPU_USE_SYNC_OBJECTS=1"); - if (getenv ("OCL_CODE_CACHE_ENABLE") == NULL) putenv ((char *) "OCL_CODE_CACHE_ENABLE=0"); From 3c530a48c4cb1f5131a3bfab69dd481309548aa8 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Thu, 27 Jul 2017 17:14:55 -0700 Subject: [PATCH 24/75] get_random_num: Simplify random() is available everywhere except Windows and DOS. Also switch Windows to rand_s. 
--- src/Makefile | 2 ++ src/shared.c | 21 +++++---------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/Makefile b/src/Makefile index e936a399c..168feb17c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -250,11 +250,13 @@ CFLAGS_CROSS_WIN32 += -I$(OPENCL_HEADERS_KHRONOS)/ CFLAGS_CROSS_WIN32 += -I$(WIN_ICONV_32)/include/ CFLAGS_CROSS_WIN32 += -m32 CFLAGS_CROSS_WIN32 += -DWITH_HWMON +CFLAGS_CROSS_WIN32 += -D_CRT_RAND_S CFLAGS_CROSS_WIN64 := $(CFLAGS) CFLAGS_CROSS_WIN64 += -I$(OPENCL_HEADERS_KHRONOS)/ CFLAGS_CROSS_WIN64 += -I$(WIN_ICONV_64)/include/ CFLAGS_CROSS_WIN64 += -m64 CFLAGS_CROSS_WIN64 += -DWITH_HWMON +CFLAGS_CROSS_WIN64 += -D_CRT_RAND_S LFLAGS_CROSS_LINUX32 := $(LFLAGS) LFLAGS_CROSS_LINUX32 += -lpthread diff --git a/src/shared.c b/src/shared.c index 83442586d..c49ee3ed8 100644 --- a/src/shared.c +++ b/src/shared.c @@ -379,27 +379,16 @@ u32 get_random_num (const u32 min, const u32 max) if (low == 0) return (0); - #if defined (__linux__) + #if defined (_WIN) - u32 data; + u32 r; + rand_s(&r); - FILE *fp = fopen ("/dev/urandom", "rb"); - - if (fp == NULL) return (0); - - const int nread = fread (&data, sizeof (u32), 1, fp); - - fclose (fp); - - if (nread != 1) return 0; - - u64 r = data % low; r += min; - - return (u32) r; + return ((r % (max - min)) + min); #else - return (((u32) rand () % (max - min)) + min); + return (((u32) random () % (max - min)) + min); #endif } From e0a565234ab403d794b26661fd05775f942ffd1d Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 28 Jul 2017 20:52:53 +0200 Subject: [PATCH 25/75] Optimized -m 7700 for ROCm --- OpenCL/m07700_a0-optimized.cl | 8 ++++---- OpenCL/m07700_a1-optimized.cl | 8 ++++---- OpenCL/m07700_a3-optimized.cl | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/OpenCL/m07700_a0-optimized.cl b/OpenCL/m07700_a0-optimized.cl index ce1601902..2fa57cdbc 100644 --- a/OpenCL/m07700_a0-optimized.cl +++ b/OpenCL/m07700_a0-optimized.cl @@ -329,7 +329,7 @@ __kernel void 
m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 @@ -415,7 +415,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule const u32 sum20 = walld0rf_magic (w0, out_len, salt_buf0, salt_len, a, b, c, d, t); - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; @@ -644,7 +644,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 @@ -730,7 +730,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule const u32 sum20 = walld0rf_magic (w0, out_len, salt_buf0, salt_len, a, b, c, d, t); - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; diff --git a/OpenCL/m07700_a1-optimized.cl b/OpenCL/m07700_a1-optimized.cl index 122a2b86e..cdc24160a 100644 --- a/OpenCL/m07700_a1-optimized.cl +++ b/OpenCL/m07700_a1-optimized.cl @@ -370,7 +370,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 @@ -456,7 +456,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; @@ -728,7 +728,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 @@ 
-814,7 +814,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; diff --git a/OpenCL/m07700_a3-optimized.cl b/OpenCL/m07700_a3-optimized.cl index d5387f9ed..81214dbce 100644 --- a/OpenCL/m07700_a3-optimized.cl +++ b/OpenCL/m07700_a3-optimized.cl @@ -296,7 +296,7 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 @@ -382,7 +382,7 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; @@ -572,7 +572,7 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[14] = pw_salt_len * 8; t[15] = 0; - PUTCHAR (t, pw_salt_len, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); /** * md5 @@ -658,7 +658,7 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); - PUTCHAR (t, sum20, 0x80); + append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; From b541e46b9b156c31bb4d3adde6aedce4e07b704a Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 29 Jul 2017 18:53:08 +0200 Subject: [PATCH 26/75] Add pure kernels for Half MD5 --- OpenCL/m05100_a0.cl | 142 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m05100_a1.cl | 119 ++++++++++++++++++++++++++++++++++ OpenCL/m05100_a3.cl | 152 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 413 insertions(+) create mode 100644 OpenCL/m05100_a0.cl create 
mode 100644 OpenCL/m05100_a1.cl create mode 100644 OpenCL/m05100_a3.cl diff --git a/OpenCL/m05100_a0.cl b/OpenCL/m05100_a0.cl new file mode 100644 index 000000000..9da9df5b7 --- /dev/null +++ b/OpenCL/m05100_a0.cl @@ -0,0 +1,142 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier 
(CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_M_SCALAR (r0, r1, z, z); + COMPARE_M_SCALAR (r1, r2, z, z); + COMPARE_M_SCALAR (r2, r3, z, z); + } +} + +__kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 0, + 0 + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int 
idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_S_SCALAR (r0, r1, z, z); + COMPARE_S_SCALAR (r1, r2, z, z); + COMPARE_S_SCALAR (r2, r3, z, z); + } +} diff --git a/OpenCL/m05100_a1.cl b/OpenCL/m05100_a1.cl new file mode 100644 index 000000000..ead0e7190 --- /dev/null +++ b/OpenCL/m05100_a1.cl @@ -0,0 +1,119 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_M_SCALAR (r0, r1, z, z); + COMPARE_M_SCALAR (r1, r2, z, z); + COMPARE_M_SCALAR (r2, r3, z, z); + } +} + + +__kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + 
* digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 0, + 0 + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + u32 z = 0; + + COMPARE_S_SCALAR (r0, r1, z, z); + COMPARE_S_SCALAR (r1, r2, z, z); + COMPARE_S_SCALAR (r2, r3, z, z); + } +} diff --git a/OpenCL/m05100_a3.cl b/OpenCL/m05100_a3.cl new file mode 100644 index 000000000..e8a25bdbc --- /dev/null +++ b/OpenCL/m05100_a3.cl @@ -0,0 +1,152 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global 
u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + u32x z = 0; + + COMPARE_M_SIMD (r0, r1, z, z); + COMPARE_M_SIMD (r1, r2, z, z); + COMPARE_M_SIMD (r2, r3, z, z); + } +} + +__kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, 
__global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 0, + 0 + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + u32x z = 0; + + COMPARE_S_SIMD (r0, r1, z, z); + COMPARE_S_SIMD (r1, r2, z, z); + COMPARE_S_SIMD (r2, r3, z, z); + } +} From 942b7068be4fda43b0523af3d019dbf329d5a05a Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 29 Jul 2017 23:19:15 +0200 Subject: [PATCH 27/75] Add pure kernels for IKE-PSK MD5 --- OpenCL/m05300_a0.cl | 194 ++++++++++++++++++++++++++++++++++++ OpenCL/m05300_a1.cl | 232 ++++++++++++++++++++++++++++++++++++++++++++ OpenCL/m05300_a3.cl | 204 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 630 insertions(+) create mode 100644 OpenCL/m05300_a0.cl create mode 100644 OpenCL/m05300_a1.cl create mode 100644 OpenCL/m05300_a3.cl diff --git 
a/OpenCL/m05300_a0.cl b/OpenCL/m05300_a0.cl new file mode 100644 index 000000000..5bcee3ed5 --- /dev/null +++ b/OpenCL/m05300_a0.cl @@ -0,0 +1,194 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; 
il_pos++) + { + // todo: add rules engine + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 
combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05300_a1.cl b/OpenCL/m05300_a1.cl new file mode 100644 index 000000000..141e20207 --- /dev/null +++ b/OpenCL/m05300_a1.cl @@ -0,0 +1,232 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" 
+#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = combs_buf[il_pos].i[idx]; + } + + switch_buffer_by_offset_1x64_le_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i 
< 64; i++) + { + c[i] |= w[i]; + } + + md5_hmac_ctx_vector_t ctx0; + + md5_hmac_init_vector (&ctx0, c, pw_len + comb_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 
digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = combs_buf[il_pos].i[idx]; + } + + switch_buffer_by_offset_1x64_le_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + md5_hmac_ctx_vector_t ctx0; + + md5_hmac_init_vector (&ctx0, c, pw_len + comb_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + 
const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05300_a3.cl b/OpenCL/m05300_a3.cl new file mode 100644 index 000000000..55fc8c1fb --- /dev/null +++ b/OpenCL/m05300_a3.cl @@ -0,0 +1,204 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * 
loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const 
u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + md5_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 
973678241100219b7a4ec6287d57aedfe26b3182 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Sat, 29 Jul 2017 15:54:20 -0700 Subject: [PATCH 28/75] Switch back to rand(). rand_s is broken in AppVeyor. --- src/Makefile | 2 -- src/shared.c | 5 +---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 168feb17c..e936a399c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -250,13 +250,11 @@ CFLAGS_CROSS_WIN32 += -I$(OPENCL_HEADERS_KHRONOS)/ CFLAGS_CROSS_WIN32 += -I$(WIN_ICONV_32)/include/ CFLAGS_CROSS_WIN32 += -m32 CFLAGS_CROSS_WIN32 += -DWITH_HWMON -CFLAGS_CROSS_WIN32 += -D_CRT_RAND_S CFLAGS_CROSS_WIN64 := $(CFLAGS) CFLAGS_CROSS_WIN64 += -I$(OPENCL_HEADERS_KHRONOS)/ CFLAGS_CROSS_WIN64 += -I$(WIN_ICONV_64)/include/ CFLAGS_CROSS_WIN64 += -m64 CFLAGS_CROSS_WIN64 += -DWITH_HWMON -CFLAGS_CROSS_WIN64 += -D_CRT_RAND_S LFLAGS_CROSS_LINUX32 := $(LFLAGS) LFLAGS_CROSS_LINUX32 += -lpthread diff --git a/src/shared.c b/src/shared.c index c49ee3ed8..e11f4df1d 100644 --- a/src/shared.c +++ b/src/shared.c @@ -381,10 +381,7 @@ u32 get_random_num (const u32 min, const u32 max) #if defined (_WIN) - u32 r; - rand_s(&r); - - return ((r % (max - min)) + min); + return (((u32) rand () % (max - min)) + min); #else From 5d137ba03682d010db0108b1a844ec3ec94e0f02 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 31 Jul 2017 09:36:57 +0200 Subject: [PATCH 29/75] Add pure kernels for IKE-PSK SHA1 --- OpenCL/m05300_a3.cl | 6 +- OpenCL/m05400_a0.cl | 194 ++++++++++++++++++++++++++++++++++++ OpenCL/m05400_a1.cl | 232 ++++++++++++++++++++++++++++++++++++++++++++ OpenCL/m05400_a3.cl | 204 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 633 insertions(+), 3 deletions(-) create mode 100644 OpenCL/m05400_a0.cl create mode 100644 OpenCL/m05400_a1.cl create mode 100644 OpenCL/m05400_a3.cl diff --git a/OpenCL/m05300_a3.cl b/OpenCL/m05300_a3.cl index 55fc8c1fb..29e83a70d 100644 --- a/OpenCL/m05300_a3.cl +++ b/OpenCL/m05300_a3.cl @@ -10,7 +10,7 @@ #include 
"inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" -#include "inc_simd.cl" +#include "inc_scalar.cl" #include "inc_hash_md5.cl" __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -98,7 +98,7 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 r2 = ctx.opad.h[DGST_R2]; const u32 r3 = ctx.opad.h[DGST_R3]; - COMPARE_M_SIMD (r0, r1, r2, r3); + COMPARE_M_SCALAR (r0, r1, r2, r3); } } @@ -199,6 +199,6 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 r2 = ctx.opad.h[DGST_R2]; const u32 r3 = ctx.opad.h[DGST_R3]; - COMPARE_S_SIMD (r0, r1, r2, r3); + COMPARE_S_SCALAR (r0, r1, r2, r3); } } diff --git a/OpenCL/m05400_a0.cl b/OpenCL/m05400_a0.cl new file mode 100644 index 000000000..ccafef2f5 --- /dev/null +++ b/OpenCL/m05400_a0.cl @@ -0,0 +1,194 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" 
+#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init_swap (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 
w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32x r0 = ctx.opad.h[DGST_R0]; + const u32x r1 = ctx.opad.h[DGST_R1]; + const u32x r2 = ctx.opad.h[DGST_R2]; + const u32x r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + 
digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init_swap (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32x r0 = ctx.opad.h[DGST_R0]; + const u32x r1 = ctx.opad.h[DGST_R1]; + const u32x r2 = ctx.opad.h[DGST_R2]; + const u32x r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl new file mode 100644 index 000000000..415d537d1 --- /dev/null +++ b/OpenCL/m05400_a1.cl @@ -0,0 +1,232 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + 
+__kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, c, pw_len + comb_len); + + sha1_hmac_update_global_swap (&ctx0, 
ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = 
get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, c, pw_len + comb_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git 
a/OpenCL/m05400_a3.cl b/OpenCL/m05400_a3.cl new file mode 100644 index 000000000..ee4ab2b53 --- /dev/null +++ b/OpenCL/m05400_a3.cl @@ -0,0 +1,204 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += 
VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32x r0 = ctx.opad.h[DGST_R0]; + const u32x r1 = ctx.opad.h[DGST_R1]; + const u32x r2 = ctx.opad.h[DGST_R2]; + const u32x r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const ikepsk_t *ikepsk_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const 
u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_hmac_ctx_t ctx0; + + sha1_hmac_init (&ctx0, w, pw_len); + + sha1_hmac_update_global_swap (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); + + sha1_hmac_final (&ctx0); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = ctx0.opad.h[4]; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + + sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); + + sha1_hmac_final (&ctx); + + const u32x r0 = ctx.opad.h[DGST_R0]; + const u32x r1 = ctx.opad.h[DGST_R1]; + const u32x r2 = ctx.opad.h[DGST_R2]; + const u32x r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} From 
50aeade65c203ed390411203cf8e297d39743603 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 31 Jul 2017 10:23:04 +0200 Subject: [PATCH 30/75] Add pure kernels for NetNTLMv1 / NetNTLMv1+ESS --- OpenCL/inc_hash_md4.cl | 1 + OpenCL/m05500_a0.cl | 771 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m05500_a1.cl | 746 +++++++++++++++++++++++++++++++++++++++ OpenCL/m05500_a3.cl | 780 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 2298 insertions(+) create mode 100644 OpenCL/m05500_a0.cl create mode 100644 OpenCL/m05500_a1.cl create mode 100644 OpenCL/m05500_a3.cl diff --git a/OpenCL/inc_hash_md4.cl b/OpenCL/inc_hash_md4.cl index a9383a5da..5bc5d5978 100644 --- a/OpenCL/inc_hash_md4.cl +++ b/OpenCL/inc_hash_md4.cl @@ -1047,6 +1047,7 @@ void md4_hmac_update_utf16le_swap (md4_hmac_ctx_t *ctx, const u32 *w, const int { md4_update_utf16le_swap (&ctx->ipad, w, len); } + void md4_hmac_update_global (md4_hmac_ctx_t *ctx, const __global u32 *w, const int len) { md4_update_global (&ctx->ipad, w, len); diff --git a/OpenCL/m05500_a0.cl b/OpenCL/m05500_a0.cl new file mode 100644 index 000000000..3c8a790ae --- /dev/null +++ b/OpenCL/m05500_a0.cl @@ -0,0 +1,771 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" + + +#define PERM_OP(a,b,tt,n,m) \ +{ \ + tt = a >> n; \ + tt = tt ^ b; \ + tt = tt & m; \ + b = b ^ tt; \ + tt = tt << n; \ + a = a ^ tt; \ +} + +#define HPERM_OP(a,tt,n,m) \ +{ \ + tt = a << (16 + n); \ + tt = tt ^ a; \ + tt = tt & m; \ + a = a ^ tt; \ + tt = tt >> (16 + n); \ + a = a ^ tt; \ +} + +__constant u32a c_SPtrans[8][64] = +{ + { + 0x02080800, 0x00080000, 0x02000002, 0x02080802, + 0x02000000, 0x00080802, 0x00080002, 0x02000002, + 0x00080802, 0x02080800, 0x02080000, 
0x00000802, + 0x02000802, 0x02000000, 0x00000000, 0x00080002, + 0x00080000, 0x00000002, 0x02000800, 0x00080800, + 0x02080802, 0x02080000, 0x00000802, 0x02000800, + 0x00000002, 0x00000800, 0x00080800, 0x02080002, + 0x00000800, 0x02000802, 0x02080002, 0x00000000, + 0x00000000, 0x02080802, 0x02000800, 0x00080002, + 0x02080800, 0x00080000, 0x00000802, 0x02000800, + 0x02080002, 0x00000800, 0x00080800, 0x02000002, + 0x00080802, 0x00000002, 0x02000002, 0x02080000, + 0x02080802, 0x00080800, 0x02080000, 0x02000802, + 0x02000000, 0x00000802, 0x00080002, 0x00000000, + 0x00080000, 0x02000000, 0x02000802, 0x02080800, + 0x00000002, 0x02080002, 0x00000800, 0x00080802, + }, + { + 0x40108010, 0x00000000, 0x00108000, 0x40100000, + 0x40000010, 0x00008010, 0x40008000, 0x00108000, + 0x00008000, 0x40100010, 0x00000010, 0x40008000, + 0x00100010, 0x40108000, 0x40100000, 0x00000010, + 0x00100000, 0x40008010, 0x40100010, 0x00008000, + 0x00108010, 0x40000000, 0x00000000, 0x00100010, + 0x40008010, 0x00108010, 0x40108000, 0x40000010, + 0x40000000, 0x00100000, 0x00008010, 0x40108010, + 0x00100010, 0x40108000, 0x40008000, 0x00108010, + 0x40108010, 0x00100010, 0x40000010, 0x00000000, + 0x40000000, 0x00008010, 0x00100000, 0x40100010, + 0x00008000, 0x40000000, 0x00108010, 0x40008010, + 0x40108000, 0x00008000, 0x00000000, 0x40000010, + 0x00000010, 0x40108010, 0x00108000, 0x40100000, + 0x40100010, 0x00100000, 0x00008010, 0x40008000, + 0x40008010, 0x00000010, 0x40100000, 0x00108000, + }, + { + 0x04000001, 0x04040100, 0x00000100, 0x04000101, + 0x00040001, 0x04000000, 0x04000101, 0x00040100, + 0x04000100, 0x00040000, 0x04040000, 0x00000001, + 0x04040101, 0x00000101, 0x00000001, 0x04040001, + 0x00000000, 0x00040001, 0x04040100, 0x00000100, + 0x00000101, 0x04040101, 0x00040000, 0x04000001, + 0x04040001, 0x04000100, 0x00040101, 0x04040000, + 0x00040100, 0x00000000, 0x04000000, 0x00040101, + 0x04040100, 0x00000100, 0x00000001, 0x00040000, + 0x00000101, 0x00040001, 0x04040000, 0x04000101, + 0x00000000, 
0x04040100, 0x00040100, 0x04040001, + 0x00040001, 0x04000000, 0x04040101, 0x00000001, + 0x00040101, 0x04000001, 0x04000000, 0x04040101, + 0x00040000, 0x04000100, 0x04000101, 0x00040100, + 0x04000100, 0x00000000, 0x04040001, 0x00000101, + 0x04000001, 0x00040101, 0x00000100, 0x04040000, + }, + { + 0x00401008, 0x10001000, 0x00000008, 0x10401008, + 0x00000000, 0x10400000, 0x10001008, 0x00400008, + 0x10401000, 0x10000008, 0x10000000, 0x00001008, + 0x10000008, 0x00401008, 0x00400000, 0x10000000, + 0x10400008, 0x00401000, 0x00001000, 0x00000008, + 0x00401000, 0x10001008, 0x10400000, 0x00001000, + 0x00001008, 0x00000000, 0x00400008, 0x10401000, + 0x10001000, 0x10400008, 0x10401008, 0x00400000, + 0x10400008, 0x00001008, 0x00400000, 0x10000008, + 0x00401000, 0x10001000, 0x00000008, 0x10400000, + 0x10001008, 0x00000000, 0x00001000, 0x00400008, + 0x00000000, 0x10400008, 0x10401000, 0x00001000, + 0x10000000, 0x10401008, 0x00401008, 0x00400000, + 0x10401008, 0x00000008, 0x10001000, 0x00401008, + 0x00400008, 0x00401000, 0x10400000, 0x10001008, + 0x00001008, 0x10000000, 0x10000008, 0x10401000, + }, + { + 0x08000000, 0x00010000, 0x00000400, 0x08010420, + 0x08010020, 0x08000400, 0x00010420, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x00010400, + 0x08000420, 0x08010020, 0x08010400, 0x00000000, + 0x00010400, 0x08000000, 0x00010020, 0x00000420, + 0x08000400, 0x00010420, 0x00000000, 0x08000020, + 0x00000020, 0x08000420, 0x08010420, 0x00010020, + 0x08010000, 0x00000400, 0x00000420, 0x08010400, + 0x08010400, 0x08000420, 0x00010020, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x08000400, + 0x08000000, 0x00010400, 0x08010420, 0x00000000, + 0x00010420, 0x08000000, 0x00000400, 0x00010020, + 0x08000420, 0x00000400, 0x00000000, 0x08010420, + 0x08010020, 0x08010400, 0x00000420, 0x00010000, + 0x00010400, 0x08010020, 0x08000400, 0x00000420, + 0x00000020, 0x00010420, 0x08010000, 0x08000020, + }, + { + 0x80000040, 0x00200040, 0x00000000, 0x80202000, + 0x00200040, 0x00002000, 
0x80002040, 0x00200000, + 0x00002040, 0x80202040, 0x00202000, 0x80000000, + 0x80002000, 0x80000040, 0x80200000, 0x00202040, + 0x00200000, 0x80002040, 0x80200040, 0x00000000, + 0x00002000, 0x00000040, 0x80202000, 0x80200040, + 0x80202040, 0x80200000, 0x80000000, 0x00002040, + 0x00000040, 0x00202000, 0x00202040, 0x80002000, + 0x00002040, 0x80000000, 0x80002000, 0x00202040, + 0x80202000, 0x00200040, 0x00000000, 0x80002000, + 0x80000000, 0x00002000, 0x80200040, 0x00200000, + 0x00200040, 0x80202040, 0x00202000, 0x00000040, + 0x80202040, 0x00202000, 0x00200000, 0x80002040, + 0x80000040, 0x80200000, 0x00202040, 0x00000000, + 0x00002000, 0x80000040, 0x80002040, 0x80202000, + 0x80200000, 0x00002040, 0x00000040, 0x80200040, + }, + { + 0x00004000, 0x00000200, 0x01000200, 0x01000004, + 0x01004204, 0x00004004, 0x00004200, 0x00000000, + 0x01000000, 0x01000204, 0x00000204, 0x01004000, + 0x00000004, 0x01004200, 0x01004000, 0x00000204, + 0x01000204, 0x00004000, 0x00004004, 0x01004204, + 0x00000000, 0x01000200, 0x01000004, 0x00004200, + 0x01004004, 0x00004204, 0x01004200, 0x00000004, + 0x00004204, 0x01004004, 0x00000200, 0x01000000, + 0x00004204, 0x01004000, 0x01004004, 0x00000204, + 0x00004000, 0x00000200, 0x01000000, 0x01004004, + 0x01000204, 0x00004204, 0x00004200, 0x00000000, + 0x00000200, 0x01000004, 0x00000004, 0x01000200, + 0x00000000, 0x01000204, 0x01000200, 0x00004200, + 0x00000204, 0x00004000, 0x01004204, 0x01000000, + 0x01004200, 0x00000004, 0x00004004, 0x01004204, + 0x01000004, 0x01004200, 0x01004000, 0x00004004, + }, + { + 0x20800080, 0x20820000, 0x00020080, 0x00000000, + 0x20020000, 0x00800080, 0x20800000, 0x20820080, + 0x00000080, 0x20000000, 0x00820000, 0x00020080, + 0x00820080, 0x20020080, 0x20000080, 0x20800000, + 0x00020000, 0x00820080, 0x00800080, 0x20020000, + 0x20820080, 0x20000080, 0x00000000, 0x00820000, + 0x20000000, 0x00800000, 0x20020080, 0x20800080, + 0x00800000, 0x00020000, 0x20820000, 0x00000080, + 0x00800000, 0x00020000, 0x20000080, 0x20820080, + 
0x00020080, 0x20000000, 0x00000000, 0x00820000, + 0x20800080, 0x20020080, 0x20020000, 0x00800080, + 0x20820000, 0x00000080, 0x00800080, 0x20020000, + 0x20820080, 0x00800000, 0x20800000, 0x20000080, + 0x00820000, 0x00020080, 0x20020080, 0x20800000, + 0x00000080, 0x20820000, 0x00820080, 0x00000000, + 0x20000000, 0x20800080, 0x00020000, 0x00820080, + } +}; + +__constant u32a c_skb[8][64] = +{ + { + 0x00000000, 0x00000010, 0x20000000, 0x20000010, + 0x00010000, 0x00010010, 0x20010000, 0x20010010, + 0x00000800, 0x00000810, 0x20000800, 0x20000810, + 0x00010800, 0x00010810, 0x20010800, 0x20010810, + 0x00000020, 0x00000030, 0x20000020, 0x20000030, + 0x00010020, 0x00010030, 0x20010020, 0x20010030, + 0x00000820, 0x00000830, 0x20000820, 0x20000830, + 0x00010820, 0x00010830, 0x20010820, 0x20010830, + 0x00080000, 0x00080010, 0x20080000, 0x20080010, + 0x00090000, 0x00090010, 0x20090000, 0x20090010, + 0x00080800, 0x00080810, 0x20080800, 0x20080810, + 0x00090800, 0x00090810, 0x20090800, 0x20090810, + 0x00080020, 0x00080030, 0x20080020, 0x20080030, + 0x00090020, 0x00090030, 0x20090020, 0x20090030, + 0x00080820, 0x00080830, 0x20080820, 0x20080830, + 0x00090820, 0x00090830, 0x20090820, 0x20090830, + }, + { + 0x00000000, 0x02000000, 0x00002000, 0x02002000, + 0x00200000, 0x02200000, 0x00202000, 0x02202000, + 0x00000004, 0x02000004, 0x00002004, 0x02002004, + 0x00200004, 0x02200004, 0x00202004, 0x02202004, + 0x00000400, 0x02000400, 0x00002400, 0x02002400, + 0x00200400, 0x02200400, 0x00202400, 0x02202400, + 0x00000404, 0x02000404, 0x00002404, 0x02002404, + 0x00200404, 0x02200404, 0x00202404, 0x02202404, + 0x10000000, 0x12000000, 0x10002000, 0x12002000, + 0x10200000, 0x12200000, 0x10202000, 0x12202000, + 0x10000004, 0x12000004, 0x10002004, 0x12002004, + 0x10200004, 0x12200004, 0x10202004, 0x12202004, + 0x10000400, 0x12000400, 0x10002400, 0x12002400, + 0x10200400, 0x12200400, 0x10202400, 0x12202400, + 0x10000404, 0x12000404, 0x10002404, 0x12002404, + 0x10200404, 0x12200404, 0x10202404, 
0x12202404, + }, + { + 0x00000000, 0x00000001, 0x00040000, 0x00040001, + 0x01000000, 0x01000001, 0x01040000, 0x01040001, + 0x00000002, 0x00000003, 0x00040002, 0x00040003, + 0x01000002, 0x01000003, 0x01040002, 0x01040003, + 0x00000200, 0x00000201, 0x00040200, 0x00040201, + 0x01000200, 0x01000201, 0x01040200, 0x01040201, + 0x00000202, 0x00000203, 0x00040202, 0x00040203, + 0x01000202, 0x01000203, 0x01040202, 0x01040203, + 0x08000000, 0x08000001, 0x08040000, 0x08040001, + 0x09000000, 0x09000001, 0x09040000, 0x09040001, + 0x08000002, 0x08000003, 0x08040002, 0x08040003, + 0x09000002, 0x09000003, 0x09040002, 0x09040003, + 0x08000200, 0x08000201, 0x08040200, 0x08040201, + 0x09000200, 0x09000201, 0x09040200, 0x09040201, + 0x08000202, 0x08000203, 0x08040202, 0x08040203, + 0x09000202, 0x09000203, 0x09040202, 0x09040203, + }, + { + 0x00000000, 0x00100000, 0x00000100, 0x00100100, + 0x00000008, 0x00100008, 0x00000108, 0x00100108, + 0x00001000, 0x00101000, 0x00001100, 0x00101100, + 0x00001008, 0x00101008, 0x00001108, 0x00101108, + 0x04000000, 0x04100000, 0x04000100, 0x04100100, + 0x04000008, 0x04100008, 0x04000108, 0x04100108, + 0x04001000, 0x04101000, 0x04001100, 0x04101100, + 0x04001008, 0x04101008, 0x04001108, 0x04101108, + 0x00020000, 0x00120000, 0x00020100, 0x00120100, + 0x00020008, 0x00120008, 0x00020108, 0x00120108, + 0x00021000, 0x00121000, 0x00021100, 0x00121100, + 0x00021008, 0x00121008, 0x00021108, 0x00121108, + 0x04020000, 0x04120000, 0x04020100, 0x04120100, + 0x04020008, 0x04120008, 0x04020108, 0x04120108, + 0x04021000, 0x04121000, 0x04021100, 0x04121100, + 0x04021008, 0x04121008, 0x04021108, 0x04121108, + }, + { + 0x00000000, 0x10000000, 0x00010000, 0x10010000, + 0x00000004, 0x10000004, 0x00010004, 0x10010004, + 0x20000000, 0x30000000, 0x20010000, 0x30010000, + 0x20000004, 0x30000004, 0x20010004, 0x30010004, + 0x00100000, 0x10100000, 0x00110000, 0x10110000, + 0x00100004, 0x10100004, 0x00110004, 0x10110004, + 0x20100000, 0x30100000, 0x20110000, 0x30110000, + 
0x20100004, 0x30100004, 0x20110004, 0x30110004, + 0x00001000, 0x10001000, 0x00011000, 0x10011000, + 0x00001004, 0x10001004, 0x00011004, 0x10011004, + 0x20001000, 0x30001000, 0x20011000, 0x30011000, + 0x20001004, 0x30001004, 0x20011004, 0x30011004, + 0x00101000, 0x10101000, 0x00111000, 0x10111000, + 0x00101004, 0x10101004, 0x00111004, 0x10111004, + 0x20101000, 0x30101000, 0x20111000, 0x30111000, + 0x20101004, 0x30101004, 0x20111004, 0x30111004, + }, + { + 0x00000000, 0x08000000, 0x00000008, 0x08000008, + 0x00000400, 0x08000400, 0x00000408, 0x08000408, + 0x00020000, 0x08020000, 0x00020008, 0x08020008, + 0x00020400, 0x08020400, 0x00020408, 0x08020408, + 0x00000001, 0x08000001, 0x00000009, 0x08000009, + 0x00000401, 0x08000401, 0x00000409, 0x08000409, + 0x00020001, 0x08020001, 0x00020009, 0x08020009, + 0x00020401, 0x08020401, 0x00020409, 0x08020409, + 0x02000000, 0x0A000000, 0x02000008, 0x0A000008, + 0x02000400, 0x0A000400, 0x02000408, 0x0A000408, + 0x02020000, 0x0A020000, 0x02020008, 0x0A020008, + 0x02020400, 0x0A020400, 0x02020408, 0x0A020408, + 0x02000001, 0x0A000001, 0x02000009, 0x0A000009, + 0x02000401, 0x0A000401, 0x02000409, 0x0A000409, + 0x02020001, 0x0A020001, 0x02020009, 0x0A020009, + 0x02020401, 0x0A020401, 0x02020409, 0x0A020409, + }, + { + 0x00000000, 0x00000100, 0x00080000, 0x00080100, + 0x01000000, 0x01000100, 0x01080000, 0x01080100, + 0x00000010, 0x00000110, 0x00080010, 0x00080110, + 0x01000010, 0x01000110, 0x01080010, 0x01080110, + 0x00200000, 0x00200100, 0x00280000, 0x00280100, + 0x01200000, 0x01200100, 0x01280000, 0x01280100, + 0x00200010, 0x00200110, 0x00280010, 0x00280110, + 0x01200010, 0x01200110, 0x01280010, 0x01280110, + 0x00000200, 0x00000300, 0x00080200, 0x00080300, + 0x01000200, 0x01000300, 0x01080200, 0x01080300, + 0x00000210, 0x00000310, 0x00080210, 0x00080310, + 0x01000210, 0x01000310, 0x01080210, 0x01080310, + 0x00200200, 0x00200300, 0x00280200, 0x00280300, + 0x01200200, 0x01200300, 0x01280200, 0x01280300, + 0x00200210, 0x00200310, 
0x00280210, 0x00280310, + 0x01200210, 0x01200310, 0x01280210, 0x01280310, + }, + { + 0x00000000, 0x04000000, 0x00040000, 0x04040000, + 0x00000002, 0x04000002, 0x00040002, 0x04040002, + 0x00002000, 0x04002000, 0x00042000, 0x04042000, + 0x00002002, 0x04002002, 0x00042002, 0x04042002, + 0x00000020, 0x04000020, 0x00040020, 0x04040020, + 0x00000022, 0x04000022, 0x00040022, 0x04040022, + 0x00002020, 0x04002020, 0x00042020, 0x04042020, + 0x00002022, 0x04002022, 0x00042022, 0x04042022, + 0x00000800, 0x04000800, 0x00040800, 0x04040800, + 0x00000802, 0x04000802, 0x00040802, 0x04040802, + 0x00002800, 0x04002800, 0x00042800, 0x04042800, + 0x00002802, 0x04002802, 0x00042802, 0x04042802, + 0x00000820, 0x04000820, 0x00040820, 0x04040820, + 0x00000822, 0x04000822, 0x00040822, 0x04040822, + 0x00002820, 0x04002820, 0x00042820, 0x04042820, + 0x00002822, 0x04002822, 0x00042822, 0x04042822 + } +}; + +#if VECT_SIZE == 1 +#define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) +#endif + +void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) +{ + u32x r = data[0]; + u32x l = data[1]; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i += 2) + { + u32x u; + u32x t; + + u = Kc[i + 0] ^ rotl32 (r, 30u); + t = Kd[i 
+ 0] ^ rotl32 (r, 26u); + + l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + + u = Kc[i + 1] ^ rotl32 (l, 30u); + t = Kd[i + 1] ^ rotl32 (l, 26u); + + r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + } + + iv[0] = l; + iv[1] = r; +} + +void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) +{ + u32x tt; + + PERM_OP (d, c, tt, 4, 0x0f0f0f0f); + HPERM_OP (c, tt, 2, 0xcccc0000); + HPERM_OP (d, tt, 2, 0xcccc0000); + PERM_OP (d, c, tt, 1, 0x55555555); + PERM_OP (c, d, tt, 8, 0x00ff00ff); + PERM_OP (d, c, tt, 1, 0x55555555); + + d = ((d & 0x000000ff) << 16) + | ((d & 0x0000ff00) << 0) + | ((d & 0x00ff0000) >> 16) + | ((c & 0xf0000000) >> 4); + + c = c & 0x0fffffff; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i++) + { + if ((i < 2) || (i == 8) || (i == 15)) + { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + else + { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } + + c = c & 0x0fffffff; + d = d & 0x0fffffff; + + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; + + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | 
BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); + + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; + + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); + + Kc[i] = ((t << 16) | (s & 0x0000ffff)); + Kd[i] = ((s >> 16) | (t & 0xffff0000)); + } +} + +void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) +{ + u32x t[8]; + + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 0xff; + + u32x k[8]; + + k[0] = (t[0] >> 0); + k[1] = (t[0] << 7) | (t[1] >> 1); + k[2] = (t[1] << 6) | (t[2] >> 2); + k[3] = (t[2] << 5) | (t[3] >> 3); + k[4] = (t[3] << 4) | (t[4] >> 4); + k[5] = (t[4] << 3) | (t[5] >> 5); + k[6] = (t[5] << 2) | (t[6] >> 6); + k[7] = (t[6] << 1); + + out[0] = ((k[0] & 0xff) << 0) + | ((k[1] & 0xff) << 8) + | ((k[2] & 0xff) << 16) + | ((k[3] & 0xff) << 24); + + out[1] = ((k[4] & 0xff) << 0) + | ((k[5] & 0xff) << 8) + | ((k[6] & 0xff) << 16) + | ((k[7] & 0xff) << 24); +} + +__kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + const u32 a = ctx.h[0]; + const u32 b = ctx.h[1]; 
+ const u32 c = ctx.h[2]; + const u32 d = ctx.h[3]; + + if ((d >> 16) != s2) continue; + + /** + * DES1 + */ + + u32 key[2]; + + transform_netntlmv1_key (a, b, key); + + u32 Kc[16]; + u32 Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 data[2]; + + data[0] = s0; + data[1] = s1; + + u32 out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + + const u32 r0 = out1[0]; + const u32 r1 = out1[1]; + const u32 r2 = out2[0]; + const u32 r3 = out2[1]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 
s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + const u32 a = ctx.h[0]; + const u32 b = ctx.h[1]; + const u32 c = ctx.h[2]; + const u32 d = ctx.h[3]; + + if ((d >> 16) != s2) continue; + + /** + * DES1 + */ + + u32 key[2]; + + transform_netntlmv1_key (a, b, key); + + u32 Kc[16]; + u32 Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 data[2]; + + data[0] = s0; + data[1] = s1; + + u32 out1[2]; + + _des_crypt_encrypt (out1, data, 
Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + /* + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + */ + + const u32 r0 = out1[0]; + const u32 r1 = out1[1]; + const u32 r2 = search[2]; + const u32 r3 = search[3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05500_a1.cl b/OpenCL/m05500_a1.cl new file mode 100644 index 000000000..34bc4711f --- /dev/null +++ b/OpenCL/m05500_a1.cl @@ -0,0 +1,746 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" + +#define PERM_OP(a,b,tt,n,m) \ +{ \ + tt = a >> n; \ + tt = tt ^ b; \ + tt = tt & m; \ + b = b ^ tt; \ + tt = tt << n; \ + a = a ^ tt; \ +} + +#define HPERM_OP(a,tt,n,m) \ +{ \ + tt = a << (16 + n); \ + tt = tt ^ a; \ + tt = tt & m; \ + a = a ^ tt; \ + tt = tt >> (16 + n); \ + a = a ^ tt; \ +} + +__constant u32a c_SPtrans[8][64] = +{ + { + 0x02080800, 0x00080000, 0x02000002, 0x02080802, + 0x02000000, 0x00080802, 0x00080002, 0x02000002, + 0x00080802, 0x02080800, 0x02080000, 0x00000802, + 0x02000802, 0x02000000, 0x00000000, 0x00080002, + 0x00080000, 0x00000002, 0x02000800, 0x00080800, + 0x02080802, 0x02080000, 0x00000802, 0x02000800, + 0x00000002, 0x00000800, 0x00080800, 0x02080002, + 0x00000800, 0x02000802, 0x02080002, 0x00000000, + 0x00000000, 0x02080802, 0x02000800, 0x00080002, + 0x02080800, 0x00080000, 0x00000802, 0x02000800, + 0x02080002, 0x00000800, 0x00080800, 0x02000002, + 0x00080802, 0x00000002, 0x02000002, 0x02080000, + 0x02080802, 0x00080800, 0x02080000, 0x02000802, + 0x02000000, 0x00000802, 0x00080002, 0x00000000, + 0x00080000, 0x02000000, 0x02000802, 0x02080800, + 0x00000002, 0x02080002, 
0x00000800, 0x00080802, + }, + { + 0x40108010, 0x00000000, 0x00108000, 0x40100000, + 0x40000010, 0x00008010, 0x40008000, 0x00108000, + 0x00008000, 0x40100010, 0x00000010, 0x40008000, + 0x00100010, 0x40108000, 0x40100000, 0x00000010, + 0x00100000, 0x40008010, 0x40100010, 0x00008000, + 0x00108010, 0x40000000, 0x00000000, 0x00100010, + 0x40008010, 0x00108010, 0x40108000, 0x40000010, + 0x40000000, 0x00100000, 0x00008010, 0x40108010, + 0x00100010, 0x40108000, 0x40008000, 0x00108010, + 0x40108010, 0x00100010, 0x40000010, 0x00000000, + 0x40000000, 0x00008010, 0x00100000, 0x40100010, + 0x00008000, 0x40000000, 0x00108010, 0x40008010, + 0x40108000, 0x00008000, 0x00000000, 0x40000010, + 0x00000010, 0x40108010, 0x00108000, 0x40100000, + 0x40100010, 0x00100000, 0x00008010, 0x40008000, + 0x40008010, 0x00000010, 0x40100000, 0x00108000, + }, + { + 0x04000001, 0x04040100, 0x00000100, 0x04000101, + 0x00040001, 0x04000000, 0x04000101, 0x00040100, + 0x04000100, 0x00040000, 0x04040000, 0x00000001, + 0x04040101, 0x00000101, 0x00000001, 0x04040001, + 0x00000000, 0x00040001, 0x04040100, 0x00000100, + 0x00000101, 0x04040101, 0x00040000, 0x04000001, + 0x04040001, 0x04000100, 0x00040101, 0x04040000, + 0x00040100, 0x00000000, 0x04000000, 0x00040101, + 0x04040100, 0x00000100, 0x00000001, 0x00040000, + 0x00000101, 0x00040001, 0x04040000, 0x04000101, + 0x00000000, 0x04040100, 0x00040100, 0x04040001, + 0x00040001, 0x04000000, 0x04040101, 0x00000001, + 0x00040101, 0x04000001, 0x04000000, 0x04040101, + 0x00040000, 0x04000100, 0x04000101, 0x00040100, + 0x04000100, 0x00000000, 0x04040001, 0x00000101, + 0x04000001, 0x00040101, 0x00000100, 0x04040000, + }, + { + 0x00401008, 0x10001000, 0x00000008, 0x10401008, + 0x00000000, 0x10400000, 0x10001008, 0x00400008, + 0x10401000, 0x10000008, 0x10000000, 0x00001008, + 0x10000008, 0x00401008, 0x00400000, 0x10000000, + 0x10400008, 0x00401000, 0x00001000, 0x00000008, + 0x00401000, 0x10001008, 0x10400000, 0x00001000, + 0x00001008, 0x00000000, 0x00400008, 
0x10401000, + 0x10001000, 0x10400008, 0x10401008, 0x00400000, + 0x10400008, 0x00001008, 0x00400000, 0x10000008, + 0x00401000, 0x10001000, 0x00000008, 0x10400000, + 0x10001008, 0x00000000, 0x00001000, 0x00400008, + 0x00000000, 0x10400008, 0x10401000, 0x00001000, + 0x10000000, 0x10401008, 0x00401008, 0x00400000, + 0x10401008, 0x00000008, 0x10001000, 0x00401008, + 0x00400008, 0x00401000, 0x10400000, 0x10001008, + 0x00001008, 0x10000000, 0x10000008, 0x10401000, + }, + { + 0x08000000, 0x00010000, 0x00000400, 0x08010420, + 0x08010020, 0x08000400, 0x00010420, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x00010400, + 0x08000420, 0x08010020, 0x08010400, 0x00000000, + 0x00010400, 0x08000000, 0x00010020, 0x00000420, + 0x08000400, 0x00010420, 0x00000000, 0x08000020, + 0x00000020, 0x08000420, 0x08010420, 0x00010020, + 0x08010000, 0x00000400, 0x00000420, 0x08010400, + 0x08010400, 0x08000420, 0x00010020, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x08000400, + 0x08000000, 0x00010400, 0x08010420, 0x00000000, + 0x00010420, 0x08000000, 0x00000400, 0x00010020, + 0x08000420, 0x00000400, 0x00000000, 0x08010420, + 0x08010020, 0x08010400, 0x00000420, 0x00010000, + 0x00010400, 0x08010020, 0x08000400, 0x00000420, + 0x00000020, 0x00010420, 0x08010000, 0x08000020, + }, + { + 0x80000040, 0x00200040, 0x00000000, 0x80202000, + 0x00200040, 0x00002000, 0x80002040, 0x00200000, + 0x00002040, 0x80202040, 0x00202000, 0x80000000, + 0x80002000, 0x80000040, 0x80200000, 0x00202040, + 0x00200000, 0x80002040, 0x80200040, 0x00000000, + 0x00002000, 0x00000040, 0x80202000, 0x80200040, + 0x80202040, 0x80200000, 0x80000000, 0x00002040, + 0x00000040, 0x00202000, 0x00202040, 0x80002000, + 0x00002040, 0x80000000, 0x80002000, 0x00202040, + 0x80202000, 0x00200040, 0x00000000, 0x80002000, + 0x80000000, 0x00002000, 0x80200040, 0x00200000, + 0x00200040, 0x80202040, 0x00202000, 0x00000040, + 0x80202040, 0x00202000, 0x00200000, 0x80002040, + 0x80000040, 0x80200000, 0x00202040, 0x00000000, + 0x00002000, 
0x80000040, 0x80002040, 0x80202000, + 0x80200000, 0x00002040, 0x00000040, 0x80200040, + }, + { + 0x00004000, 0x00000200, 0x01000200, 0x01000004, + 0x01004204, 0x00004004, 0x00004200, 0x00000000, + 0x01000000, 0x01000204, 0x00000204, 0x01004000, + 0x00000004, 0x01004200, 0x01004000, 0x00000204, + 0x01000204, 0x00004000, 0x00004004, 0x01004204, + 0x00000000, 0x01000200, 0x01000004, 0x00004200, + 0x01004004, 0x00004204, 0x01004200, 0x00000004, + 0x00004204, 0x01004004, 0x00000200, 0x01000000, + 0x00004204, 0x01004000, 0x01004004, 0x00000204, + 0x00004000, 0x00000200, 0x01000000, 0x01004004, + 0x01000204, 0x00004204, 0x00004200, 0x00000000, + 0x00000200, 0x01000004, 0x00000004, 0x01000200, + 0x00000000, 0x01000204, 0x01000200, 0x00004200, + 0x00000204, 0x00004000, 0x01004204, 0x01000000, + 0x01004200, 0x00000004, 0x00004004, 0x01004204, + 0x01000004, 0x01004200, 0x01004000, 0x00004004, + }, + { + 0x20800080, 0x20820000, 0x00020080, 0x00000000, + 0x20020000, 0x00800080, 0x20800000, 0x20820080, + 0x00000080, 0x20000000, 0x00820000, 0x00020080, + 0x00820080, 0x20020080, 0x20000080, 0x20800000, + 0x00020000, 0x00820080, 0x00800080, 0x20020000, + 0x20820080, 0x20000080, 0x00000000, 0x00820000, + 0x20000000, 0x00800000, 0x20020080, 0x20800080, + 0x00800000, 0x00020000, 0x20820000, 0x00000080, + 0x00800000, 0x00020000, 0x20000080, 0x20820080, + 0x00020080, 0x20000000, 0x00000000, 0x00820000, + 0x20800080, 0x20020080, 0x20020000, 0x00800080, + 0x20820000, 0x00000080, 0x00800080, 0x20020000, + 0x20820080, 0x00800000, 0x20800000, 0x20000080, + 0x00820000, 0x00020080, 0x20020080, 0x20800000, + 0x00000080, 0x20820000, 0x00820080, 0x00000000, + 0x20000000, 0x20800080, 0x00020000, 0x00820080, + } +}; + +__constant u32a c_skb[8][64] = +{ + { + 0x00000000, 0x00000010, 0x20000000, 0x20000010, + 0x00010000, 0x00010010, 0x20010000, 0x20010010, + 0x00000800, 0x00000810, 0x20000800, 0x20000810, + 0x00010800, 0x00010810, 0x20010800, 0x20010810, + 0x00000020, 0x00000030, 0x20000020, 
0x20000030, + 0x00010020, 0x00010030, 0x20010020, 0x20010030, + 0x00000820, 0x00000830, 0x20000820, 0x20000830, + 0x00010820, 0x00010830, 0x20010820, 0x20010830, + 0x00080000, 0x00080010, 0x20080000, 0x20080010, + 0x00090000, 0x00090010, 0x20090000, 0x20090010, + 0x00080800, 0x00080810, 0x20080800, 0x20080810, + 0x00090800, 0x00090810, 0x20090800, 0x20090810, + 0x00080020, 0x00080030, 0x20080020, 0x20080030, + 0x00090020, 0x00090030, 0x20090020, 0x20090030, + 0x00080820, 0x00080830, 0x20080820, 0x20080830, + 0x00090820, 0x00090830, 0x20090820, 0x20090830, + }, + { + 0x00000000, 0x02000000, 0x00002000, 0x02002000, + 0x00200000, 0x02200000, 0x00202000, 0x02202000, + 0x00000004, 0x02000004, 0x00002004, 0x02002004, + 0x00200004, 0x02200004, 0x00202004, 0x02202004, + 0x00000400, 0x02000400, 0x00002400, 0x02002400, + 0x00200400, 0x02200400, 0x00202400, 0x02202400, + 0x00000404, 0x02000404, 0x00002404, 0x02002404, + 0x00200404, 0x02200404, 0x00202404, 0x02202404, + 0x10000000, 0x12000000, 0x10002000, 0x12002000, + 0x10200000, 0x12200000, 0x10202000, 0x12202000, + 0x10000004, 0x12000004, 0x10002004, 0x12002004, + 0x10200004, 0x12200004, 0x10202004, 0x12202004, + 0x10000400, 0x12000400, 0x10002400, 0x12002400, + 0x10200400, 0x12200400, 0x10202400, 0x12202400, + 0x10000404, 0x12000404, 0x10002404, 0x12002404, + 0x10200404, 0x12200404, 0x10202404, 0x12202404, + }, + { + 0x00000000, 0x00000001, 0x00040000, 0x00040001, + 0x01000000, 0x01000001, 0x01040000, 0x01040001, + 0x00000002, 0x00000003, 0x00040002, 0x00040003, + 0x01000002, 0x01000003, 0x01040002, 0x01040003, + 0x00000200, 0x00000201, 0x00040200, 0x00040201, + 0x01000200, 0x01000201, 0x01040200, 0x01040201, + 0x00000202, 0x00000203, 0x00040202, 0x00040203, + 0x01000202, 0x01000203, 0x01040202, 0x01040203, + 0x08000000, 0x08000001, 0x08040000, 0x08040001, + 0x09000000, 0x09000001, 0x09040000, 0x09040001, + 0x08000002, 0x08000003, 0x08040002, 0x08040003, + 0x09000002, 0x09000003, 0x09040002, 0x09040003, + 0x08000200, 
0x08000201, 0x08040200, 0x08040201, + 0x09000200, 0x09000201, 0x09040200, 0x09040201, + 0x08000202, 0x08000203, 0x08040202, 0x08040203, + 0x09000202, 0x09000203, 0x09040202, 0x09040203, + }, + { + 0x00000000, 0x00100000, 0x00000100, 0x00100100, + 0x00000008, 0x00100008, 0x00000108, 0x00100108, + 0x00001000, 0x00101000, 0x00001100, 0x00101100, + 0x00001008, 0x00101008, 0x00001108, 0x00101108, + 0x04000000, 0x04100000, 0x04000100, 0x04100100, + 0x04000008, 0x04100008, 0x04000108, 0x04100108, + 0x04001000, 0x04101000, 0x04001100, 0x04101100, + 0x04001008, 0x04101008, 0x04001108, 0x04101108, + 0x00020000, 0x00120000, 0x00020100, 0x00120100, + 0x00020008, 0x00120008, 0x00020108, 0x00120108, + 0x00021000, 0x00121000, 0x00021100, 0x00121100, + 0x00021008, 0x00121008, 0x00021108, 0x00121108, + 0x04020000, 0x04120000, 0x04020100, 0x04120100, + 0x04020008, 0x04120008, 0x04020108, 0x04120108, + 0x04021000, 0x04121000, 0x04021100, 0x04121100, + 0x04021008, 0x04121008, 0x04021108, 0x04121108, + }, + { + 0x00000000, 0x10000000, 0x00010000, 0x10010000, + 0x00000004, 0x10000004, 0x00010004, 0x10010004, + 0x20000000, 0x30000000, 0x20010000, 0x30010000, + 0x20000004, 0x30000004, 0x20010004, 0x30010004, + 0x00100000, 0x10100000, 0x00110000, 0x10110000, + 0x00100004, 0x10100004, 0x00110004, 0x10110004, + 0x20100000, 0x30100000, 0x20110000, 0x30110000, + 0x20100004, 0x30100004, 0x20110004, 0x30110004, + 0x00001000, 0x10001000, 0x00011000, 0x10011000, + 0x00001004, 0x10001004, 0x00011004, 0x10011004, + 0x20001000, 0x30001000, 0x20011000, 0x30011000, + 0x20001004, 0x30001004, 0x20011004, 0x30011004, + 0x00101000, 0x10101000, 0x00111000, 0x10111000, + 0x00101004, 0x10101004, 0x00111004, 0x10111004, + 0x20101000, 0x30101000, 0x20111000, 0x30111000, + 0x20101004, 0x30101004, 0x20111004, 0x30111004, + }, + { + 0x00000000, 0x08000000, 0x00000008, 0x08000008, + 0x00000400, 0x08000400, 0x00000408, 0x08000408, + 0x00020000, 0x08020000, 0x00020008, 0x08020008, + 0x00020400, 0x08020400, 
0x00020408, 0x08020408, + 0x00000001, 0x08000001, 0x00000009, 0x08000009, + 0x00000401, 0x08000401, 0x00000409, 0x08000409, + 0x00020001, 0x08020001, 0x00020009, 0x08020009, + 0x00020401, 0x08020401, 0x00020409, 0x08020409, + 0x02000000, 0x0A000000, 0x02000008, 0x0A000008, + 0x02000400, 0x0A000400, 0x02000408, 0x0A000408, + 0x02020000, 0x0A020000, 0x02020008, 0x0A020008, + 0x02020400, 0x0A020400, 0x02020408, 0x0A020408, + 0x02000001, 0x0A000001, 0x02000009, 0x0A000009, + 0x02000401, 0x0A000401, 0x02000409, 0x0A000409, + 0x02020001, 0x0A020001, 0x02020009, 0x0A020009, + 0x02020401, 0x0A020401, 0x02020409, 0x0A020409, + }, + { + 0x00000000, 0x00000100, 0x00080000, 0x00080100, + 0x01000000, 0x01000100, 0x01080000, 0x01080100, + 0x00000010, 0x00000110, 0x00080010, 0x00080110, + 0x01000010, 0x01000110, 0x01080010, 0x01080110, + 0x00200000, 0x00200100, 0x00280000, 0x00280100, + 0x01200000, 0x01200100, 0x01280000, 0x01280100, + 0x00200010, 0x00200110, 0x00280010, 0x00280110, + 0x01200010, 0x01200110, 0x01280010, 0x01280110, + 0x00000200, 0x00000300, 0x00080200, 0x00080300, + 0x01000200, 0x01000300, 0x01080200, 0x01080300, + 0x00000210, 0x00000310, 0x00080210, 0x00080310, + 0x01000210, 0x01000310, 0x01080210, 0x01080310, + 0x00200200, 0x00200300, 0x00280200, 0x00280300, + 0x01200200, 0x01200300, 0x01280200, 0x01280300, + 0x00200210, 0x00200310, 0x00280210, 0x00280310, + 0x01200210, 0x01200310, 0x01280210, 0x01280310, + }, + { + 0x00000000, 0x04000000, 0x00040000, 0x04040000, + 0x00000002, 0x04000002, 0x00040002, 0x04040002, + 0x00002000, 0x04002000, 0x00042000, 0x04042000, + 0x00002002, 0x04002002, 0x00042002, 0x04042002, + 0x00000020, 0x04000020, 0x00040020, 0x04040020, + 0x00000022, 0x04000022, 0x00040022, 0x04040022, + 0x00002020, 0x04002020, 0x00042020, 0x04042020, + 0x00002022, 0x04002022, 0x00042022, 0x04042022, + 0x00000800, 0x04000800, 0x00040800, 0x04040800, + 0x00000802, 0x04000802, 0x00040802, 0x04040802, + 0x00002800, 0x04002800, 0x00042800, 0x04042800, + 
0x00002802, 0x04002802, 0x00042802, 0x04042802, + 0x00000820, 0x04000820, 0x00040820, 0x04040820, + 0x00000822, 0x04000822, 0x00040822, 0x04040822, + 0x00002820, 0x04002820, 0x00042820, 0x04042820, + 0x00002822, 0x04002822, 0x00042822, 0x04042822 + } +}; + +#if VECT_SIZE == 1 +#define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) +#endif + +void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) +{ + u32x r = data[0]; + u32x l = data[1]; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i += 2) + { + u32x u; + u32x t; + + u = Kc[i + 0] ^ rotl32 (r, 30u); + t = Kd[i + 0] ^ rotl32 (r, 26u); + + l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + + u = Kc[i + 1] ^ rotl32 (l, 30u); + t = Kd[i + 1] ^ rotl32 (l, 26u); + + r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) 
& 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + } + + iv[0] = l; + iv[1] = r; +} + +void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) +{ + u32x tt; + + PERM_OP (d, c, tt, 4, 0x0f0f0f0f); + HPERM_OP (c, tt, 2, 0xcccc0000); + HPERM_OP (d, tt, 2, 0xcccc0000); + PERM_OP (d, c, tt, 1, 0x55555555); + PERM_OP (c, d, tt, 8, 0x00ff00ff); + PERM_OP (d, c, tt, 1, 0x55555555); + + d = ((d & 0x000000ff) << 16) + | ((d & 0x0000ff00) << 0) + | ((d & 0x00ff0000) >> 16) + | ((c & 0xf0000000) >> 4); + + c = c & 0x0fffffff; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i++) + { + if ((i < 2) || (i == 8) || (i == 15)) + { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + else + { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } + + c = c & 0x0fffffff; + d = d & 0x0fffffff; + + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; + + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); + + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; + + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); + + Kc[i] = ((t << 16) | (s & 0x0000ffff)); + Kd[i] = ((s >> 16) | (t & 0xffff0000)); + } +} + +void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x 
out[2]) +{ + u32x t[8]; + + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 0xff; + + u32x k[8]; + + k[0] = (t[0] >> 0); + k[1] = (t[0] << 7) | (t[1] >> 1); + k[2] = (t[1] << 6) | (t[2] >> 2); + k[3] = (t[2] << 5) | (t[3] >> 3); + k[4] = (t[3] << 4) | (t[4] >> 4); + k[5] = (t[4] << 3) | (t[5] >> 5); + k[6] = (t[5] << 2) | (t[6] >> 6); + k[7] = (t[6] << 1); + + out[0] = ((k[0] & 0xff) << 0) + | ((k[1] & 0xff) << 8) + | ((k[2] & 0xff) << 16) + | ((k[3] & 0xff) << 24); + + out[1] = ((k[4] & 0xff) << 0) + | ((k[5] & 0xff) << 8) + | ((k[6] & 0xff) << 16) + | ((k[7] & 0xff) << 24); +} + +__kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 
s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + const u32 a = ctx.h[0]; + const u32 b = ctx.h[1]; + const u32 c = ctx.h[2]; + const u32 d = ctx.h[3]; + + if ((d >> 16) != s2) continue; + + /** + * DES1 + */ + + u32 key[2]; + + transform_netntlmv1_key (a, b, key); + + u32 Kc[16]; + u32 Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 data[2]; + + data[0] = s0; + data[1] = s1; + + u32 out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + + const u32 r0 = out1[0]; + const u32 r1 = out1[1]; + const u32 r2 = out2[0]; + const u32 r3 = out2[1]; + + COMPARE_M_SCALAR 
(r0, r1, r2, r3); + } +} + +__kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] 
= + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + const u32 a = ctx.h[0]; + const u32 b = ctx.h[1]; + const u32 c = ctx.h[2]; + const u32 d = ctx.h[3]; + + if ((d >> 16) != s2) continue; + + /** + * DES1 + */ + + u32 key[2]; + + transform_netntlmv1_key (a, b, key); + + u32 Kc[16]; + u32 Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 data[2]; + + data[0] = s0; + data[1] = s1; + + u32 out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + /* + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32 out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + */ + + const u32 r0 = out1[0]; + const u32 r1 = out1[1]; + const u32 r2 = search[2]; + const u32 r3 = search[3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05500_a3.cl b/OpenCL/m05500_a3.cl new file mode 100644 index 000000000..261e49457 --- /dev/null +++ b/OpenCL/m05500_a3.cl @@ -0,0 +1,780 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include 
"inc_hash_md4.cl" + +#define PERM_OP(a,b,tt,n,m) \ +{ \ + tt = a >> n; \ + tt = tt ^ b; \ + tt = tt & m; \ + b = b ^ tt; \ + tt = tt << n; \ + a = a ^ tt; \ +} + +#define HPERM_OP(a,tt,n,m) \ +{ \ + tt = a << (16 + n); \ + tt = tt ^ a; \ + tt = tt & m; \ + a = a ^ tt; \ + tt = tt >> (16 + n); \ + a = a ^ tt; \ +} + +__constant u32a c_SPtrans[8][64] = +{ + { + 0x02080800, 0x00080000, 0x02000002, 0x02080802, + 0x02000000, 0x00080802, 0x00080002, 0x02000002, + 0x00080802, 0x02080800, 0x02080000, 0x00000802, + 0x02000802, 0x02000000, 0x00000000, 0x00080002, + 0x00080000, 0x00000002, 0x02000800, 0x00080800, + 0x02080802, 0x02080000, 0x00000802, 0x02000800, + 0x00000002, 0x00000800, 0x00080800, 0x02080002, + 0x00000800, 0x02000802, 0x02080002, 0x00000000, + 0x00000000, 0x02080802, 0x02000800, 0x00080002, + 0x02080800, 0x00080000, 0x00000802, 0x02000800, + 0x02080002, 0x00000800, 0x00080800, 0x02000002, + 0x00080802, 0x00000002, 0x02000002, 0x02080000, + 0x02080802, 0x00080800, 0x02080000, 0x02000802, + 0x02000000, 0x00000802, 0x00080002, 0x00000000, + 0x00080000, 0x02000000, 0x02000802, 0x02080800, + 0x00000002, 0x02080002, 0x00000800, 0x00080802, + }, + { + 0x40108010, 0x00000000, 0x00108000, 0x40100000, + 0x40000010, 0x00008010, 0x40008000, 0x00108000, + 0x00008000, 0x40100010, 0x00000010, 0x40008000, + 0x00100010, 0x40108000, 0x40100000, 0x00000010, + 0x00100000, 0x40008010, 0x40100010, 0x00008000, + 0x00108010, 0x40000000, 0x00000000, 0x00100010, + 0x40008010, 0x00108010, 0x40108000, 0x40000010, + 0x40000000, 0x00100000, 0x00008010, 0x40108010, + 0x00100010, 0x40108000, 0x40008000, 0x00108010, + 0x40108010, 0x00100010, 0x40000010, 0x00000000, + 0x40000000, 0x00008010, 0x00100000, 0x40100010, + 0x00008000, 0x40000000, 0x00108010, 0x40008010, + 0x40108000, 0x00008000, 0x00000000, 0x40000010, + 0x00000010, 0x40108010, 0x00108000, 0x40100000, + 0x40100010, 0x00100000, 0x00008010, 0x40008000, + 0x40008010, 0x00000010, 0x40100000, 0x00108000, + }, + { + 0x04000001, 
0x04040100, 0x00000100, 0x04000101, + 0x00040001, 0x04000000, 0x04000101, 0x00040100, + 0x04000100, 0x00040000, 0x04040000, 0x00000001, + 0x04040101, 0x00000101, 0x00000001, 0x04040001, + 0x00000000, 0x00040001, 0x04040100, 0x00000100, + 0x00000101, 0x04040101, 0x00040000, 0x04000001, + 0x04040001, 0x04000100, 0x00040101, 0x04040000, + 0x00040100, 0x00000000, 0x04000000, 0x00040101, + 0x04040100, 0x00000100, 0x00000001, 0x00040000, + 0x00000101, 0x00040001, 0x04040000, 0x04000101, + 0x00000000, 0x04040100, 0x00040100, 0x04040001, + 0x00040001, 0x04000000, 0x04040101, 0x00000001, + 0x00040101, 0x04000001, 0x04000000, 0x04040101, + 0x00040000, 0x04000100, 0x04000101, 0x00040100, + 0x04000100, 0x00000000, 0x04040001, 0x00000101, + 0x04000001, 0x00040101, 0x00000100, 0x04040000, + }, + { + 0x00401008, 0x10001000, 0x00000008, 0x10401008, + 0x00000000, 0x10400000, 0x10001008, 0x00400008, + 0x10401000, 0x10000008, 0x10000000, 0x00001008, + 0x10000008, 0x00401008, 0x00400000, 0x10000000, + 0x10400008, 0x00401000, 0x00001000, 0x00000008, + 0x00401000, 0x10001008, 0x10400000, 0x00001000, + 0x00001008, 0x00000000, 0x00400008, 0x10401000, + 0x10001000, 0x10400008, 0x10401008, 0x00400000, + 0x10400008, 0x00001008, 0x00400000, 0x10000008, + 0x00401000, 0x10001000, 0x00000008, 0x10400000, + 0x10001008, 0x00000000, 0x00001000, 0x00400008, + 0x00000000, 0x10400008, 0x10401000, 0x00001000, + 0x10000000, 0x10401008, 0x00401008, 0x00400000, + 0x10401008, 0x00000008, 0x10001000, 0x00401008, + 0x00400008, 0x00401000, 0x10400000, 0x10001008, + 0x00001008, 0x10000000, 0x10000008, 0x10401000, + }, + { + 0x08000000, 0x00010000, 0x00000400, 0x08010420, + 0x08010020, 0x08000400, 0x00010420, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x00010400, + 0x08000420, 0x08010020, 0x08010400, 0x00000000, + 0x00010400, 0x08000000, 0x00010020, 0x00000420, + 0x08000400, 0x00010420, 0x00000000, 0x08000020, + 0x00000020, 0x08000420, 0x08010420, 0x00010020, + 0x08010000, 0x00000400, 0x00000420, 
0x08010400, + 0x08010400, 0x08000420, 0x00010020, 0x08010000, + 0x00010000, 0x00000020, 0x08000020, 0x08000400, + 0x08000000, 0x00010400, 0x08010420, 0x00000000, + 0x00010420, 0x08000000, 0x00000400, 0x00010020, + 0x08000420, 0x00000400, 0x00000000, 0x08010420, + 0x08010020, 0x08010400, 0x00000420, 0x00010000, + 0x00010400, 0x08010020, 0x08000400, 0x00000420, + 0x00000020, 0x00010420, 0x08010000, 0x08000020, + }, + { + 0x80000040, 0x00200040, 0x00000000, 0x80202000, + 0x00200040, 0x00002000, 0x80002040, 0x00200000, + 0x00002040, 0x80202040, 0x00202000, 0x80000000, + 0x80002000, 0x80000040, 0x80200000, 0x00202040, + 0x00200000, 0x80002040, 0x80200040, 0x00000000, + 0x00002000, 0x00000040, 0x80202000, 0x80200040, + 0x80202040, 0x80200000, 0x80000000, 0x00002040, + 0x00000040, 0x00202000, 0x00202040, 0x80002000, + 0x00002040, 0x80000000, 0x80002000, 0x00202040, + 0x80202000, 0x00200040, 0x00000000, 0x80002000, + 0x80000000, 0x00002000, 0x80200040, 0x00200000, + 0x00200040, 0x80202040, 0x00202000, 0x00000040, + 0x80202040, 0x00202000, 0x00200000, 0x80002040, + 0x80000040, 0x80200000, 0x00202040, 0x00000000, + 0x00002000, 0x80000040, 0x80002040, 0x80202000, + 0x80200000, 0x00002040, 0x00000040, 0x80200040, + }, + { + 0x00004000, 0x00000200, 0x01000200, 0x01000004, + 0x01004204, 0x00004004, 0x00004200, 0x00000000, + 0x01000000, 0x01000204, 0x00000204, 0x01004000, + 0x00000004, 0x01004200, 0x01004000, 0x00000204, + 0x01000204, 0x00004000, 0x00004004, 0x01004204, + 0x00000000, 0x01000200, 0x01000004, 0x00004200, + 0x01004004, 0x00004204, 0x01004200, 0x00000004, + 0x00004204, 0x01004004, 0x00000200, 0x01000000, + 0x00004204, 0x01004000, 0x01004004, 0x00000204, + 0x00004000, 0x00000200, 0x01000000, 0x01004004, + 0x01000204, 0x00004204, 0x00004200, 0x00000000, + 0x00000200, 0x01000004, 0x00000004, 0x01000200, + 0x00000000, 0x01000204, 0x01000200, 0x00004200, + 0x00000204, 0x00004000, 0x01004204, 0x01000000, + 0x01004200, 0x00000004, 0x00004004, 0x01004204, + 0x01000004, 
0x01004200, 0x01004000, 0x00004004, + }, + { + 0x20800080, 0x20820000, 0x00020080, 0x00000000, + 0x20020000, 0x00800080, 0x20800000, 0x20820080, + 0x00000080, 0x20000000, 0x00820000, 0x00020080, + 0x00820080, 0x20020080, 0x20000080, 0x20800000, + 0x00020000, 0x00820080, 0x00800080, 0x20020000, + 0x20820080, 0x20000080, 0x00000000, 0x00820000, + 0x20000000, 0x00800000, 0x20020080, 0x20800080, + 0x00800000, 0x00020000, 0x20820000, 0x00000080, + 0x00800000, 0x00020000, 0x20000080, 0x20820080, + 0x00020080, 0x20000000, 0x00000000, 0x00820000, + 0x20800080, 0x20020080, 0x20020000, 0x00800080, + 0x20820000, 0x00000080, 0x00800080, 0x20020000, + 0x20820080, 0x00800000, 0x20800000, 0x20000080, + 0x00820000, 0x00020080, 0x20020080, 0x20800000, + 0x00000080, 0x20820000, 0x00820080, 0x00000000, + 0x20000000, 0x20800080, 0x00020000, 0x00820080, + } +}; + +__constant u32a c_skb[8][64] = +{ + { + 0x00000000, 0x00000010, 0x20000000, 0x20000010, + 0x00010000, 0x00010010, 0x20010000, 0x20010010, + 0x00000800, 0x00000810, 0x20000800, 0x20000810, + 0x00010800, 0x00010810, 0x20010800, 0x20010810, + 0x00000020, 0x00000030, 0x20000020, 0x20000030, + 0x00010020, 0x00010030, 0x20010020, 0x20010030, + 0x00000820, 0x00000830, 0x20000820, 0x20000830, + 0x00010820, 0x00010830, 0x20010820, 0x20010830, + 0x00080000, 0x00080010, 0x20080000, 0x20080010, + 0x00090000, 0x00090010, 0x20090000, 0x20090010, + 0x00080800, 0x00080810, 0x20080800, 0x20080810, + 0x00090800, 0x00090810, 0x20090800, 0x20090810, + 0x00080020, 0x00080030, 0x20080020, 0x20080030, + 0x00090020, 0x00090030, 0x20090020, 0x20090030, + 0x00080820, 0x00080830, 0x20080820, 0x20080830, + 0x00090820, 0x00090830, 0x20090820, 0x20090830, + }, + { + 0x00000000, 0x02000000, 0x00002000, 0x02002000, + 0x00200000, 0x02200000, 0x00202000, 0x02202000, + 0x00000004, 0x02000004, 0x00002004, 0x02002004, + 0x00200004, 0x02200004, 0x00202004, 0x02202004, + 0x00000400, 0x02000400, 0x00002400, 0x02002400, + 0x00200400, 0x02200400, 0x00202400, 
0x02202400, + 0x00000404, 0x02000404, 0x00002404, 0x02002404, + 0x00200404, 0x02200404, 0x00202404, 0x02202404, + 0x10000000, 0x12000000, 0x10002000, 0x12002000, + 0x10200000, 0x12200000, 0x10202000, 0x12202000, + 0x10000004, 0x12000004, 0x10002004, 0x12002004, + 0x10200004, 0x12200004, 0x10202004, 0x12202004, + 0x10000400, 0x12000400, 0x10002400, 0x12002400, + 0x10200400, 0x12200400, 0x10202400, 0x12202400, + 0x10000404, 0x12000404, 0x10002404, 0x12002404, + 0x10200404, 0x12200404, 0x10202404, 0x12202404, + }, + { + 0x00000000, 0x00000001, 0x00040000, 0x00040001, + 0x01000000, 0x01000001, 0x01040000, 0x01040001, + 0x00000002, 0x00000003, 0x00040002, 0x00040003, + 0x01000002, 0x01000003, 0x01040002, 0x01040003, + 0x00000200, 0x00000201, 0x00040200, 0x00040201, + 0x01000200, 0x01000201, 0x01040200, 0x01040201, + 0x00000202, 0x00000203, 0x00040202, 0x00040203, + 0x01000202, 0x01000203, 0x01040202, 0x01040203, + 0x08000000, 0x08000001, 0x08040000, 0x08040001, + 0x09000000, 0x09000001, 0x09040000, 0x09040001, + 0x08000002, 0x08000003, 0x08040002, 0x08040003, + 0x09000002, 0x09000003, 0x09040002, 0x09040003, + 0x08000200, 0x08000201, 0x08040200, 0x08040201, + 0x09000200, 0x09000201, 0x09040200, 0x09040201, + 0x08000202, 0x08000203, 0x08040202, 0x08040203, + 0x09000202, 0x09000203, 0x09040202, 0x09040203, + }, + { + 0x00000000, 0x00100000, 0x00000100, 0x00100100, + 0x00000008, 0x00100008, 0x00000108, 0x00100108, + 0x00001000, 0x00101000, 0x00001100, 0x00101100, + 0x00001008, 0x00101008, 0x00001108, 0x00101108, + 0x04000000, 0x04100000, 0x04000100, 0x04100100, + 0x04000008, 0x04100008, 0x04000108, 0x04100108, + 0x04001000, 0x04101000, 0x04001100, 0x04101100, + 0x04001008, 0x04101008, 0x04001108, 0x04101108, + 0x00020000, 0x00120000, 0x00020100, 0x00120100, + 0x00020008, 0x00120008, 0x00020108, 0x00120108, + 0x00021000, 0x00121000, 0x00021100, 0x00121100, + 0x00021008, 0x00121008, 0x00021108, 0x00121108, + 0x04020000, 0x04120000, 0x04020100, 0x04120100, + 0x04020008, 
0x04120008, 0x04020108, 0x04120108, + 0x04021000, 0x04121000, 0x04021100, 0x04121100, + 0x04021008, 0x04121008, 0x04021108, 0x04121108, + }, + { + 0x00000000, 0x10000000, 0x00010000, 0x10010000, + 0x00000004, 0x10000004, 0x00010004, 0x10010004, + 0x20000000, 0x30000000, 0x20010000, 0x30010000, + 0x20000004, 0x30000004, 0x20010004, 0x30010004, + 0x00100000, 0x10100000, 0x00110000, 0x10110000, + 0x00100004, 0x10100004, 0x00110004, 0x10110004, + 0x20100000, 0x30100000, 0x20110000, 0x30110000, + 0x20100004, 0x30100004, 0x20110004, 0x30110004, + 0x00001000, 0x10001000, 0x00011000, 0x10011000, + 0x00001004, 0x10001004, 0x00011004, 0x10011004, + 0x20001000, 0x30001000, 0x20011000, 0x30011000, + 0x20001004, 0x30001004, 0x20011004, 0x30011004, + 0x00101000, 0x10101000, 0x00111000, 0x10111000, + 0x00101004, 0x10101004, 0x00111004, 0x10111004, + 0x20101000, 0x30101000, 0x20111000, 0x30111000, + 0x20101004, 0x30101004, 0x20111004, 0x30111004, + }, + { + 0x00000000, 0x08000000, 0x00000008, 0x08000008, + 0x00000400, 0x08000400, 0x00000408, 0x08000408, + 0x00020000, 0x08020000, 0x00020008, 0x08020008, + 0x00020400, 0x08020400, 0x00020408, 0x08020408, + 0x00000001, 0x08000001, 0x00000009, 0x08000009, + 0x00000401, 0x08000401, 0x00000409, 0x08000409, + 0x00020001, 0x08020001, 0x00020009, 0x08020009, + 0x00020401, 0x08020401, 0x00020409, 0x08020409, + 0x02000000, 0x0A000000, 0x02000008, 0x0A000008, + 0x02000400, 0x0A000400, 0x02000408, 0x0A000408, + 0x02020000, 0x0A020000, 0x02020008, 0x0A020008, + 0x02020400, 0x0A020400, 0x02020408, 0x0A020408, + 0x02000001, 0x0A000001, 0x02000009, 0x0A000009, + 0x02000401, 0x0A000401, 0x02000409, 0x0A000409, + 0x02020001, 0x0A020001, 0x02020009, 0x0A020009, + 0x02020401, 0x0A020401, 0x02020409, 0x0A020409, + }, + { + 0x00000000, 0x00000100, 0x00080000, 0x00080100, + 0x01000000, 0x01000100, 0x01080000, 0x01080100, + 0x00000010, 0x00000110, 0x00080010, 0x00080110, + 0x01000010, 0x01000110, 0x01080010, 0x01080110, + 0x00200000, 0x00200100, 
0x00280000, 0x00280100, + 0x01200000, 0x01200100, 0x01280000, 0x01280100, + 0x00200010, 0x00200110, 0x00280010, 0x00280110, + 0x01200010, 0x01200110, 0x01280010, 0x01280110, + 0x00000200, 0x00000300, 0x00080200, 0x00080300, + 0x01000200, 0x01000300, 0x01080200, 0x01080300, + 0x00000210, 0x00000310, 0x00080210, 0x00080310, + 0x01000210, 0x01000310, 0x01080210, 0x01080310, + 0x00200200, 0x00200300, 0x00280200, 0x00280300, + 0x01200200, 0x01200300, 0x01280200, 0x01280300, + 0x00200210, 0x00200310, 0x00280210, 0x00280310, + 0x01200210, 0x01200310, 0x01280210, 0x01280310, + }, + { + 0x00000000, 0x04000000, 0x00040000, 0x04040000, + 0x00000002, 0x04000002, 0x00040002, 0x04040002, + 0x00002000, 0x04002000, 0x00042000, 0x04042000, + 0x00002002, 0x04002002, 0x00042002, 0x04042002, + 0x00000020, 0x04000020, 0x00040020, 0x04040020, + 0x00000022, 0x04000022, 0x00040022, 0x04040022, + 0x00002020, 0x04002020, 0x00042020, 0x04042020, + 0x00002022, 0x04002022, 0x00042022, 0x04042022, + 0x00000800, 0x04000800, 0x00040800, 0x04040800, + 0x00000802, 0x04000802, 0x00040802, 0x04040802, + 0x00002800, 0x04002800, 0x00042800, 0x04042800, + 0x00002802, 0x04002802, 0x00042802, 0x04042802, + 0x00000820, 0x04000820, 0x00040820, 0x04040820, + 0x00000822, 0x04000822, 0x00040822, 0x04040822, + 0x00002820, 0x04002820, 0x00042820, 0x04042820, + 0x00002822, 0x04002822, 0x00042822, 0x04042822 + } +}; + +#if VECT_SIZE == 1 +#define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], 
(S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) +#endif + +void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) +{ + u32x r = data[0]; + u32x l = data[1]; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i += 2) + { + u32x u; + u32x t; + + u = Kc[i + 0] ^ rotl32 (r, 30u); + t = Kd[i + 0] ^ rotl32 (r, 26u); + + l ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + + u = Kc[i + 1] ^ rotl32 (l, 30u); + t = Kd[i + 1] ^ rotl32 (l, 26u); + + r ^= BOX (((u >> 0) & 0x3f), 0, s_SPtrans) + | BOX (((u >> 8) & 0x3f), 2, s_SPtrans) + | BOX (((u >> 16) & 0x3f), 4, s_SPtrans) + | BOX (((u >> 24) & 0x3f), 6, s_SPtrans) + | BOX (((t >> 0) & 0x3f), 1, s_SPtrans) + | BOX (((t >> 8) & 0x3f), 3, s_SPtrans) + | BOX (((t >> 16) & 0x3f), 5, s_SPtrans) + | BOX (((t >> 24) & 0x3f), 7, s_SPtrans); + } + + iv[0] = l; + iv[1] = r; +} + +void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) +{ + u32x tt; + + PERM_OP (d, c, tt, 4, 0x0f0f0f0f); + HPERM_OP (c, tt, 2, 0xcccc0000); + HPERM_OP (d, tt, 2, 0xcccc0000); + PERM_OP (d, c, tt, 1, 0x55555555); + PERM_OP (c, d, tt, 8, 0x00ff00ff); + PERM_OP (d, c, tt, 1, 0x55555555); + + d = ((d & 0x000000ff) << 16) + | ((d & 0x0000ff00) << 0) + | ((d & 0x00ff0000) >> 16) + | ((c & 0xf0000000) >> 4); + + c = c & 0x0fffffff; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 16; i++) + { + if ((i < 2) || (i == 8) || (i == 15)) + { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + 
else + { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } + + c = c & 0x0fffffff; + d = d & 0x0fffffff; + + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; + + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); + + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; + + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); + + Kc[i] = ((t << 16) | (s & 0x0000ffff)); + Kd[i] = ((s >> 16) | (t & 0xffff0000)); + } +} + +void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) +{ + u32x t[8]; + + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 0xff; + + u32x k[8]; + + k[0] = (t[0] >> 0); + k[1] = (t[0] << 7) | (t[1] >> 1); + k[2] = (t[1] << 6) | (t[2] >> 2); + k[3] = (t[2] << 5) | (t[3] >> 3); + k[4] = (t[3] << 4) | (t[4] >> 4); + k[5] = (t[4] << 3) | (t[5] >> 5); + k[6] = (t[5] << 2) | (t[6] >> 6); + k[7] = (t[6] << 1); + + out[0] = ((k[0] & 0xff) << 0) + | ((k[1] & 0xff) << 8) + | ((k[2] & 0xff) << 16) + | ((k[3] & 0xff) << 24); + + out[1] = ((k[4] & 0xff) << 0) + | ((k[5] & 0xff) << 8) + | ((k[6] & 0xff) << 16) + | ((k[7] & 0xff) << 24); +} + +__kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const 
pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = 
salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_vector_t ctx; + + md4_init_vector (&ctx); + + md4_update_vector_utf16le (&ctx, w, pw_len); + + md4_final_vector (&ctx); + + const u32x a = ctx.h[0]; + const u32x b = ctx.h[1]; + const u32x c = ctx.h[2]; + const u32x d = ctx.h[3]; + + if (MATCHES_NONE_VS ((d >> 16), s2)) continue; + + /** + * DES1 + */ + + u32x key[2]; + + transform_netntlmv1_key (a, b, key); + + u32x Kc[16]; + u32x Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32x data[2]; + + data[0] = s0; + data[1] = s1; + + u32x out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32x out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + + const u32x r0 = out1[0]; + const u32x r1 = out1[1]; + const u32x r2 = out2[0]; + const u32x r3 = out2[1]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t 
*plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * sbox, kbox + */ + + __local u32 s_SPtrans[8][64]; + __local u32 s_skb[8][64]; + + for (u32 i = lid; i < 64; i += lsz) + { + s_SPtrans[0][i] = c_SPtrans[0][i]; + s_SPtrans[1][i] = c_SPtrans[1][i]; + s_SPtrans[2][i] = c_SPtrans[2][i]; + s_SPtrans[3][i] = c_SPtrans[3][i]; + s_SPtrans[4][i] = c_SPtrans[4][i]; + s_SPtrans[5][i] = c_SPtrans[5][i]; + s_SPtrans[6][i] = c_SPtrans[6][i]; + s_SPtrans[7][i] = c_SPtrans[7][i]; + + s_skb[0][i] = c_skb[0][i]; + s_skb[1][i] = c_skb[1][i]; + s_skb[2][i] = c_skb[2][i]; + s_skb[3][i] = c_skb[3][i]; + s_skb[4][i] = c_skb[4][i]; + s_skb[5][i] = c_skb[5][i]; + s_skb[6][i] = c_skb[6][i]; + s_skb[7][i] = c_skb[7][i]; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 s0 = salt_bufs[salt_pos].salt_buf[0]; + const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; + const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx 
= 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_vector_t ctx; + + md4_init_vector (&ctx); + + md4_update_vector_utf16le (&ctx, w, pw_len); + + md4_final_vector (&ctx); + + const u32x a = ctx.h[0]; + const u32x b = ctx.h[1]; + const u32x c = ctx.h[2]; + const u32x d = ctx.h[3]; + + if (MATCHES_NONE_VS ((d >> 16), s2)) continue; + + /** + * DES1 + */ + + u32x key[2]; + + transform_netntlmv1_key (a, b, key); + + u32x Kc[16]; + u32x Kd[16]; + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32x data[2]; + + data[0] = s0; + data[1] = s1; + + u32x out1[2]; + + _des_crypt_encrypt (out1, data, Kc, Kd, s_SPtrans); + + /** + * DES2 + */ + + /* + transform_netntlmv1_key (((b >> 24) | (c << 8)), ((c >> 24) | (d << 8)), key); + + _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); + + u32x out2[2]; + + _des_crypt_encrypt (out2, data, Kc, Kd, s_SPtrans); + */ + + const u32x r0 = out1[0]; + const u32x r1 = out1[1]; + const u32x r2 = search[2]; + const u32x r3 = search[3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 443fa960d3257ce9e16eac9535bfda894f665697 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 31 Jul 2017 15:28:22 +0200 Subject: [PATCH 31/75] Fix vector function calls --- OpenCL/m05300_a0.cl | 4 ++-- OpenCL/m05300_a1.cl | 12 ++++++------ OpenCL/m05300_a3.cl | 4 ++-- OpenCL/m05400_a0.cl | 4 ++-- OpenCL/m05400_a1.cl | 4 ++-- OpenCL/m05400_a3.cl | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/OpenCL/m05300_a0.cl b/OpenCL/m05300_a0.cl index 5bcee3ed5..6c17f8865 100644 --- a/OpenCL/m05300_a0.cl +++ b/OpenCL/m05300_a0.cl @@ -83,7 +83,7 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx; - md5_hmac_init_vector_64 
(&ctx, w0, w1, w2, w3); + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); @@ -178,7 +178,7 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx; - md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); diff --git a/OpenCL/m05300_a1.cl b/OpenCL/m05300_a1.cl index 141e20207..aedad50e9 100644 --- a/OpenCL/m05300_a1.cl +++ b/OpenCL/m05300_a1.cl @@ -69,9 +69,9 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - md5_hmac_ctx_vector_t ctx0; + md5_hmac_ctx_t ctx0; - md5_hmac_init_vector (&ctx0, c, pw_len + comb_len); + md5_hmac_init (&ctx0, c, pw_len + comb_len); md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); @@ -101,7 +101,7 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx; - md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); @@ -184,9 +184,9 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - md5_hmac_ctx_vector_t ctx0; + md5_hmac_ctx_t ctx0; - md5_hmac_init_vector (&ctx0, c, pw_len + comb_len); + md5_hmac_init (&ctx0, c, pw_len + comb_len); md5_hmac_update_global (&ctx0, ikepsk_bufs[digests_offset].nr_buf, ikepsk_bufs[digests_offset].nr_len); @@ -216,7 +216,7 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx; - md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, 
ikepsk_bufs[digests_offset].msg_len); diff --git a/OpenCL/m05300_a3.cl b/OpenCL/m05300_a3.cl index 29e83a70d..e42db4387 100644 --- a/OpenCL/m05300_a3.cl +++ b/OpenCL/m05300_a3.cl @@ -87,7 +87,7 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx; - md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); @@ -188,7 +188,7 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx; - md5_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); md5_hmac_update_global (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); diff --git a/OpenCL/m05400_a0.cl b/OpenCL/m05400_a0.cl index ccafef2f5..34b78379f 100644 --- a/OpenCL/m05400_a0.cl +++ b/OpenCL/m05400_a0.cl @@ -83,7 +83,7 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); @@ -178,7 +178,7 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); diff --git a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl index 415d537d1..74bd00d06 100644 --- a/OpenCL/m05400_a1.cl +++ b/OpenCL/m05400_a1.cl @@ -101,7 +101,7 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); sha1_hmac_update_global_swap 
(&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); @@ -216,7 +216,7 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); diff --git a/OpenCL/m05400_a3.cl b/OpenCL/m05400_a3.cl index ee4ab2b53..92c014049 100644 --- a/OpenCL/m05400_a3.cl +++ b/OpenCL/m05400_a3.cl @@ -87,7 +87,7 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); @@ -188,7 +188,7 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector_64 (&ctx, w0, w1, w2, w3); + sha1_hmac_init_64 (&ctx, w0, w1, w2, w3); sha1_hmac_update_global_swap (&ctx, ikepsk_bufs[digests_offset].msg_buf, ikepsk_bufs[digests_offset].msg_len); From a9fed50ce0e7f04847c679d8f5bebf9cbedaa0e1 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 31 Jul 2017 15:29:28 +0200 Subject: [PATCH 32/75] Add pure kernels for NetNTLMv2 --- OpenCL/m05600_a0.cl | 245 ++++++++++++++++++++++++++++++++++++++++++ OpenCL/m05600_a1.cl | 221 ++++++++++++++++++++++++++++++++++++++ OpenCL/m05600_a3.cl | 255 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 721 insertions(+) create mode 100644 OpenCL/m05600_a0.cl create mode 100644 OpenCL/m05600_a1.cl create mode 100644 OpenCL/m05600_a3.cl diff --git a/OpenCL/m05600_a0.cl b/OpenCL/m05600_a0.cl new file mode 100644 index 000000000..d839d314d --- /dev/null +++ b/OpenCL/m05600_a0.cl @@ -0,0 +1,245 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define 
NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +__kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + 
+ w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 
*d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + 
md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05600_a1.cl b/OpenCL/m05600_a1.cl new file mode 100644 index 000000000..5f494e6d7 --- /dev/null +++ b/OpenCL/m05600_a1.cl @@ -0,0 +1,221 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +__kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + 
*/ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md4_ctx_t ctx10; + + md4_init (&ctx10); + + md4_update_global_utf16le (&ctx10, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx1 = ctx10; + + md4_update_global_utf16le (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md4_ctx_t ctx10; + + md4_init (&ctx10); + + md4_update_global_utf16le (&ctx10, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx1 = ctx10; + + md4_update_global_utf16le (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, 
netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m05600_a3.cl b/OpenCL/m05600_a3.cl new file mode 100644 index 000000000..df7ea791c --- /dev/null +++ b/OpenCL/m05600_a3.cl @@ -0,0 +1,255 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +__kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, 
__global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global 
(&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const netntlm_t *netntlm_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; 
idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0lr = w0l | w0r; + + w[0] = w0lr; + + md4_ctx_t ctx1; + + md4_init (&ctx1); + + md4_update_utf16le (&ctx1, w, pw_len); + + md4_final (&ctx1); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = ctx1.h[0]; + w0[1] = ctx1.h[1]; + w0[2] = ctx1.h[2]; + w0[3] = ctx1.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx0; + + md5_hmac_init (&ctx0, w, pw_len); + + md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); + + md5_hmac_final (&ctx0); + + w0[0] = ctx0.opad.h[0]; + w0[1] = ctx0.opad.h[1]; + w0[2] = ctx0.opad.h[2]; + w0[3] = ctx0.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + md5_hmac_update_global (&ctx, netntlm_bufs[digests_offset].chall_buf, netntlm_bufs[digests_offset].srvchall_len + netntlm_bufs[digests_offset].clichall_len); + + md5_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} From 6946329b029f0ae5199139e2903e8ae1944ba222 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 31 Jul 2017 15:37:49 +0200 Subject: [PATCH 33/75] Fix BF pure kernels for NetNTLMv2 --- OpenCL/m05600_a3.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/OpenCL/m05600_a3.cl b/OpenCL/m05600_a3.cl index 
df7ea791c..c963a3d66 100644 --- a/OpenCL/m05600_a3.cl +++ b/OpenCL/m05600_a3.cl @@ -88,7 +88,7 @@ __kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx0; - md5_hmac_init (&ctx0, w, pw_len); + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); @@ -214,7 +214,7 @@ __kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_hmac_ctx_t ctx0; - md5_hmac_init (&ctx0, w, pw_len); + md5_hmac_init_64 (&ctx0, w0, w1, w2, w3); md5_hmac_update_global (&ctx0, netntlm_bufs[digests_offset].userdomain_buf, netntlm_bufs[digests_offset].user_len + netntlm_bufs[digests_offset].domain_len); From cbd8f81a1c5c2ec0da0993493d95ffaf10da2df8 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 09:59:20 +0200 Subject: [PATCH 34/75] Add pure kernels for RipeMD160 --- OpenCL/m06000_a0.cl | 134 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m06000_a1.cl | 110 +++++++++++++++++++++++++++++++++ OpenCL/m06000_a3.cl | 144 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 388 insertions(+) create mode 100644 OpenCL/m06000_a0.cl create mode 100644 OpenCL/m06000_a1.cl create mode 100644 OpenCL/m06000_a3.cl diff --git a/OpenCL/m06000_a0.cl b/OpenCL/m06000_a0.cl new file mode 100644 index 000000000..6f4c1d1a0 --- /dev/null +++ b/OpenCL/m06000_a0.cl @@ -0,0 +1,134 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_ripemd160.cl" + +__kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global 
void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + ripemd160_ctx_t ctx; + + ripemd160_init (&ctx); + + ripemd160_update (&ctx, w, pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global 
const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + ripemd160_ctx_t ctx; + + ripemd160_init (&ctx); + + ripemd160_update (&ctx, w, pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m06000_a1.cl b/OpenCL/m06000_a1.cl new file mode 100644 index 000000000..9aa0a3c17 --- /dev/null +++ b/OpenCL/m06000_a1.cl @@ -0,0 +1,110 @@ +/** + * Author......: See docs/credits.txt + * 
License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_ripemd160.cl" + +__kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + ripemd160_ctx_t ctx0; + + ripemd160_init (&ctx0); + + ripemd160_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + ripemd160_ctx_t ctx = ctx0; + + ripemd160_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void 
m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + ripemd160_ctx_t ctx0; + + ripemd160_init (&ctx0); + + ripemd160_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + ripemd160_ctx_t ctx = ctx0; + + ripemd160_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + ripemd160_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git 
a/OpenCL/m06000_a3.cl b/OpenCL/m06000_a3.cl new file mode 100644 index 000000000..e3b171d63 --- /dev/null +++ b/OpenCL/m06000_a3.cl @@ -0,0 +1,144 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_ripemd160.cl" + +__kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) 
+ { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + ripemd160_ctx_vector_t ctx; + + ripemd160_init_vector (&ctx); + + ripemd160_update_vector (&ctx, w, pw_len); + + ripemd160_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + 
+ u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + ripemd160_ctx_vector_t ctx; + + ripemd160_init_vector (&ctx); + + ripemd160_update_vector (&ctx, w, pw_len); + + ripemd160_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From d573a73072075803d199ef87025994db602a1f75 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 10:23:14 +0200 Subject: [PATCH 35/75] Use MIN() for setting pw_max --- src/interface.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/interface.c b/src/interface.c index 0aaa2f488..435353242 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24642,9 +24642,9 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) { switch (user_options_extra->attack_kern) { - case ATTACK_KERN_STRAIGHT: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_STRAIGHT: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; - case ATTACK_KERN_COMBI: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_COMBI: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; } } @@ -24654,9 +24654,9 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) { switch (user_options_extra->attack_kern) { - case ATTACK_KERN_STRAIGHT: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + case ATTACK_KERN_STRAIGHT: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; - case ATTACK_KERN_COMBI: if (hashconfig->pw_max > PW_DICTMAX) hashconfig->pw_max = PW_DICTMAX; + 
case ATTACK_KERN_COMBI: hashconfig->pw_max = MIN (hashconfig->pw_max, PW_DICTMAX); break; } } @@ -24668,27 +24668,29 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) switch (hashconfig->hash_mode) { - case 500: hashconfig->pw_max = 15; // -L available + case 500: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 1600: hashconfig->pw_max = 15; // -L available + case 1600: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 1800: hashconfig->pw_max = 16; // -L available + case 1800: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // pure kernel available break; - case 5800: hashconfig->pw_max = 16; // -L available + case 5800: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // pure kernel available break; - case 6300: hashconfig->pw_max = 15; // -L available + case 6300: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 7000: hashconfig->pw_max = 19; // todo + case 6900: hashconfig->pw_max = MIN (hashconfig->pw_max, 32); // todo break; - case 7400: hashconfig->pw_max = 15; // -L available + case 7000: hashconfig->pw_max = MIN (hashconfig->pw_max, 19); // todo break; - case 10700: hashconfig->pw_max = 16; // -L available + case 7400: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; - case 12500: hashconfig->pw_max = 20; // todo + case 10700: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // pure kernel available break; - case 14400: hashconfig->pw_max = 24; // todo + case 12500: hashconfig->pw_max = MIN (hashconfig->pw_max, 20); // todo break; - case 15500: hashconfig->pw_max = 16; // todo + case 14400: hashconfig->pw_max = MIN (hashconfig->pw_max, 24); // todo + break; + case 15500: hashconfig->pw_max = MIN (hashconfig->pw_max, 16); // todo break; } } From 6379e76bc8f3191010cfb4ea5288dc99aa5c4cb8 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 12:12:24 +0200 Subject: [PATCH 36/75] No need for 
handling OPTS_TYPE_PT_ADD80 and OPTS_TYPE_PT_ADD01 in non-optimized combinator mode --- src/opencl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/opencl.c b/src/opencl.c index a37be66ae..ee9ab76ef 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -2190,6 +2190,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co uppercase (ptr, line_len); } + /* if (combinator_ctx->combs_mode == COMBINATOR_MODE_BASE_LEFT) { if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) @@ -2202,6 +2203,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co ptr[line_len] = 0x01; } } + */ device_param->combs_buf[i].pw_len = line_len; From 83d37ebeff8758dee0a7d74c6e40607c5ff95462 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 14:16:27 +0200 Subject: [PATCH 37/75] Add pure kernels for FortiGate (FortiOS) --- OpenCL/m07000_a0.cl | 199 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m07000_a1.cl | 170 +++++++++++++++++++++++++++++++++++ OpenCL/m07000_a3.cl | 212 ++++++++++++++++++++++++++++++++++++++++++++ src/interface.c | 2 +- 4 files changed, 582 insertions(+), 1 deletion(-) create mode 100644 OpenCL/m07000_a0.cl create mode 100644 OpenCL/m07000_a1.cl create mode 100644 OpenCL/m07000_a3.cl diff --git a/OpenCL/m07000_a0.cl b/OpenCL/m07000_a0.cl new file mode 100644 index 000000000..40201efbf --- /dev/null +++ b/OpenCL/m07000_a0.cl @@ -0,0 +1,199 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, 
__global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final 
(&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, 
salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07000_a1.cl b/OpenCL/m07000_a1.cl new file mode 100644 index 000000000..34e434180 --- /dev/null +++ b/OpenCL/m07000_a1.cl @@ -0,0 +1,170 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global 
u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + /** + * pepper + */ + + u32 p0[4]; + u32 p1[4]; + u32 p2[4]; + u32 p3[4]; + + p0[0] = swap32_S (FORTIGATE_A); + p0[1] = swap32_S (FORTIGATE_B); + p0[2] = swap32_S (FORTIGATE_C); + p0[3] = swap32_S (FORTIGATE_D); + p1[0] = swap32_S (FORTIGATE_E); + p1[1] = swap32_S (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = 
ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07000_a3.cl b/OpenCL/m07000_a3.cl new file mode 100644 index 000000000..e80136832 --- /dev/null +++ b/OpenCL/m07000_a3.cl @@ -0,0 +1,212 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier 
(CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32x p0[4]; + u32x p1[4]; + u32x p2[4]; + u32x p3[4]; + + p0[0] = swap32 (FORTIGATE_A); + p0[1] = swap32 (FORTIGATE_B); + p0[2] = swap32 (FORTIGATE_C); + p0[3] = swap32 (FORTIGATE_D); + p1[0] = swap32 (FORTIGATE_E); + p1[1] = swap32 (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_vector_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 
bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_swap (&ctx, w, pw_len); + + /** + * pepper + */ + + u32x p0[4]; + u32x p1[4]; + u32x p2[4]; + u32x p3[4]; + + p0[0] = swap32 (FORTIGATE_A); + p0[1] = swap32 (FORTIGATE_B); + p0[2] = swap32 (FORTIGATE_C); + p0[3] = swap32 (FORTIGATE_D); + p1[0] = swap32 (FORTIGATE_E); + p1[1] = swap32 (FORTIGATE_F); + p1[2] = 0; + p1[3] = 0; + p2[0] = 0; + p2[1] = 0; + p2[2] = 0; + p2[3] = 0; + p3[0] = 0; + p3[1] = 0; + p3[2] = 0; + p3[3] = 0; + + sha1_update_vector_64 (&ctx, p0, p1, p2, p3, 24); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD 
(r0, r1, r2, r3); + } +} diff --git a/src/interface.c b/src/interface.c index 435353242..d5aca61ca 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24680,7 +24680,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) break; case 6900: hashconfig->pw_max = MIN (hashconfig->pw_max, 32); // todo break; - case 7000: hashconfig->pw_max = MIN (hashconfig->pw_max, 19); // todo + case 7000: hashconfig->pw_max = MIN (hashconfig->pw_max, 19); // pure kernel available break; case 7400: hashconfig->pw_max = MIN (hashconfig->pw_max, 15); // pure kernel available break; From 1eb249c5b4b364c76cfe1fd830a2f3f26f209cc5 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 14:42:28 +0200 Subject: [PATCH 38/75] Add pure kernels for IPMI2 RAKP HMAC-SHA1 --- OpenCL/m07300_a0.cl | 134 ++++++++++++++++++++++++++++++++++ OpenCL/m07300_a1.cl | 172 ++++++++++++++++++++++++++++++++++++++++++++ OpenCL/m07300_a3.cl | 144 +++++++++++++++++++++++++++++++++++++ 3 files changed, 450 insertions(+) create mode 100644 OpenCL/m07300_a0.cl create mode 100644 OpenCL/m07300_a1.cl create mode 100644 OpenCL/m07300_a3.cl diff --git a/OpenCL/m07300_a0.cl b/OpenCL/m07300_a0.cl new file mode 100644 index 000000000..1b78ea340 --- /dev/null +++ b/OpenCL/m07300_a0.cl @@ -0,0 +1,134 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, 
__global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_swap (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global 
const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init_swap (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07300_a1.cl b/OpenCL/m07300_a1.cl new file mode 100644 index 000000000..3e8e63a71 --- /dev/null +++ b/OpenCL/m07300_a1.cl @@ -0,0 +1,172 @@ +/** + * Author......: See docs/credits.txt + * 
License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + 
switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, c, pw_len + comb_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] 
+ }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = swap32_S (pws[gid].i[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + const u32 comb_len = combs_buf[il_pos].pw_len; + + u32 c[64]; + + #ifdef _unroll + #pragma unroll + #endif + for (int idx = 0; idx < 64; idx++) + { + c[idx] = swap32_S (combs_buf[il_pos].i[idx]); + } + + switch_buffer_by_offset_1x64_be_S (c, pw_len); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 64; i++) + { + c[i] |= w[i]; + } + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, c, pw_len + comb_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m07300_a3.cl b/OpenCL/m07300_a3.cl new file mode 100644 index 000000000..ff2f6f0b1 --- /dev/null +++ b/OpenCL/m07300_a3.cl @@ -0,0 +1,144 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, 
__global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0 = w0l | w0r; + + w[0] = w0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 
*bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const rakp_t *rakp_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32 w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32 w0 = w0l | w0r; + + w[0] = w0; + + sha1_hmac_ctx_t ctx; + + sha1_hmac_init (&ctx, w, pw_len); + + sha1_hmac_update_global (&ctx, rakp_bufs[digests_offset].salt_buf, rakp_bufs[digests_offset].salt_len); + + sha1_hmac_final (&ctx); + + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} From 2802f1d59207041326b030062f8df090030971d6 Mon Sep 17 
00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 14:56:09 +0200 Subject: [PATCH 39/75] Fix vector function calls and datatypes --- OpenCL/m00050_a1.cl | 16 +++--- OpenCL/m00150_a1.cl | 16 +++--- OpenCL/m01100_a0.cl | 4 +- OpenCL/m01100_a1.cl | 4 +- OpenCL/m01450_a1.cl | 16 +++--- .../{m01500_a0.cl => m01500_a0-optimized.cl} | 0 .../{m01500_a1.cl => m01500_a1-optimized.cl} | 0 .../{m01500_a3.cl => m01500_a3-optimized.cl} | 0 OpenCL/m01750_a1.cl | 16 +++--- .../{m02000_a0.cl => m02000_a0-optimized.cl} | 0 .../{m02000_a1.cl => m02000_a1-optimized.cl} | 0 .../{m02000_a3.cl => m02000_a3-optimized.cl} | 0 OpenCL/m02610_a0.cl | 4 +- OpenCL/m02610_a1.cl | 4 +- OpenCL/m02810_a0.cl | 4 +- OpenCL/m02810_a1.cl | 4 +- OpenCL/m02810_a3.cl | 56 +++++++++---------- .../{m03000_a0.cl => m03000_a0-optimized.cl} | 0 .../{m03000_a1.cl => m03000_a1-optimized.cl} | 0 .../{m03000_a3.cl => m03000_a3-optimized.cl} | 0 OpenCL/m03710_a0.cl | 4 +- OpenCL/m03710_a1.cl | 4 +- OpenCL/m03910_a0.cl | 4 +- OpenCL/m03910_a1.cl | 4 +- OpenCL/m04310_a0.cl | 4 +- OpenCL/m04310_a1.cl | 4 +- OpenCL/m04800_a0.cl | 4 +- OpenCL/m04800_a1.cl | 4 +- OpenCL/m05400_a0.cl | 16 +++--- OpenCL/m05500_a0.cl | 42 +++++++------- OpenCL/m05500_a1.cl | 42 +++++++------- OpenCL/m10800_a0.cl | 16 +++--- OpenCL/m10800_a1.cl | 16 +++--- 33 files changed, 154 insertions(+), 154 deletions(-) rename OpenCL/{m01500_a0.cl => m01500_a0-optimized.cl} (100%) rename OpenCL/{m01500_a1.cl => m01500_a1-optimized.cl} (100%) rename OpenCL/{m01500_a3.cl => m01500_a3-optimized.cl} (100%) rename OpenCL/{m02000_a0.cl => m02000_a0-optimized.cl} (100%) rename OpenCL/{m02000_a1.cl => m02000_a1-optimized.cl} (100%) rename OpenCL/{m02000_a3.cl => m02000_a3-optimized.cl} (100%) rename OpenCL/{m03000_a0.cl => m03000_a0-optimized.cl} (100%) rename OpenCL/{m03000_a1.cl => m03000_a1-optimized.cl} (100%) rename OpenCL/{m03000_a3.cl => m03000_a3-optimized.cl} (100%) diff --git a/OpenCL/m00050_a1.cl b/OpenCL/m00050_a1.cl index 7d75b6e4f..de0739fb6 
100644 --- a/OpenCL/m00050_a1.cl +++ b/OpenCL/m00050_a1.cl @@ -82,13 +82,13 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - md5_hmac_ctx_vector_t ctx; + md5_hmac_ctx_t ctx; - md5_hmac_init_vector (&ctx, c, pw_len + comb_len); + md5_hmac_init (&ctx, c, pw_len + comb_len); - md5_hmac_update_vector (&ctx, s, salt_len); + md5_hmac_update (&ctx, s, salt_len); - md5_hmac_final_vector (&ctx); + md5_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; @@ -180,13 +180,13 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - md5_hmac_ctx_vector_t ctx; + md5_hmac_ctx_t ctx; - md5_hmac_init_vector (&ctx, c, pw_len + comb_len); + md5_hmac_init (&ctx, c, pw_len + comb_len); - md5_hmac_update_vector (&ctx, s, salt_len); + md5_hmac_update (&ctx, s, salt_len); - md5_hmac_final_vector (&ctx); + md5_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; diff --git a/OpenCL/m00150_a1.cl b/OpenCL/m00150_a1.cl index 15ab612bf..96153a25c 100644 --- a/OpenCL/m00150_a1.cl +++ b/OpenCL/m00150_a1.cl @@ -82,13 +82,13 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha1_hmac_ctx_vector_t ctx; + sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha1_hmac_init (&ctx, c, pw_len + comb_len); - sha1_hmac_update_vector (&ctx, s, salt_len); + sha1_hmac_update (&ctx, s, salt_len); - sha1_hmac_final_vector (&ctx); + sha1_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; @@ -180,13 +180,13 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha1_hmac_ctx_vector_t ctx; + sha1_hmac_ctx_t ctx; - sha1_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha1_hmac_init (&ctx, c, pw_len + comb_len); - sha1_hmac_update_vector (&ctx, s, salt_len); + sha1_hmac_update (&ctx, 
s, salt_len); - sha1_hmac_final_vector (&ctx); + sha1_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; diff --git a/OpenCL/m01100_a0.cl b/OpenCL/m01100_a0.cl index e257ed697..085e29e18 100644 --- a/OpenCL/m01100_a0.cl +++ b/OpenCL/m01100_a0.cl @@ -47,7 +47,7 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -140,7 +140,7 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m01100_a1.cl b/OpenCL/m01100_a1.cl index 56fa3bfda..6c7ef2d9e 100644 --- a/OpenCL/m01100_a1.cl +++ b/OpenCL/m01100_a1.cl @@ -32,7 +32,7 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -114,7 +114,7 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m01450_a1.cl b/OpenCL/m01450_a1.cl index c482c7626..25e81d266 100644 --- a/OpenCL/m01450_a1.cl +++ b/OpenCL/m01450_a1.cl @@ -82,13 +82,13 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha256_hmac_ctx_vector_t ctx; + sha256_hmac_ctx_t ctx; - sha256_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha256_hmac_init (&ctx, c, pw_len + comb_len); - sha256_hmac_update_vector (&ctx, s, salt_len); + sha256_hmac_update (&ctx, s, salt_len); - sha256_hmac_final_vector (&ctx); + sha256_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 
r1 = ctx.opad.h[DGST_R1]; @@ -180,13 +180,13 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha256_hmac_ctx_vector_t ctx; + sha256_hmac_ctx_t ctx; - sha256_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha256_hmac_init (&ctx, c, pw_len + comb_len); - sha256_hmac_update_vector (&ctx, s, salt_len); + sha256_hmac_update (&ctx, s, salt_len); - sha256_hmac_final_vector (&ctx); + sha256_hmac_final (&ctx); const u32 r0 = ctx.opad.h[DGST_R0]; const u32 r1 = ctx.opad.h[DGST_R1]; diff --git a/OpenCL/m01500_a0.cl b/OpenCL/m01500_a0-optimized.cl similarity index 100% rename from OpenCL/m01500_a0.cl rename to OpenCL/m01500_a0-optimized.cl diff --git a/OpenCL/m01500_a1.cl b/OpenCL/m01500_a1-optimized.cl similarity index 100% rename from OpenCL/m01500_a1.cl rename to OpenCL/m01500_a1-optimized.cl diff --git a/OpenCL/m01500_a3.cl b/OpenCL/m01500_a3-optimized.cl similarity index 100% rename from OpenCL/m01500_a3.cl rename to OpenCL/m01500_a3-optimized.cl diff --git a/OpenCL/m01750_a1.cl b/OpenCL/m01750_a1.cl index 27be72d5e..d6d9da0fd 100644 --- a/OpenCL/m01750_a1.cl +++ b/OpenCL/m01750_a1.cl @@ -82,13 +82,13 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha512_hmac_ctx_vector_t ctx; + sha512_hmac_ctx_t ctx; - sha512_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha512_hmac_init (&ctx, c, pw_len + comb_len); - sha512_hmac_update_vector (&ctx, s, salt_len); + sha512_hmac_update (&ctx, s, salt_len); - sha512_hmac_final_vector (&ctx); + sha512_hmac_final (&ctx); const u32 r0 = l32_from_64_S (ctx.opad.h[7]); const u32 r1 = h32_from_64_S (ctx.opad.h[7]); @@ -180,13 +180,13 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule c[i] |= w[i]; } - sha512_hmac_ctx_vector_t ctx; + sha512_hmac_ctx_t ctx; - sha512_hmac_init_vector (&ctx, c, pw_len + comb_len); + sha512_hmac_init (&ctx, c, pw_len + comb_len); - sha512_hmac_update_vector (&ctx, s, 
salt_len); + sha512_hmac_update (&ctx, s, salt_len); - sha512_hmac_final_vector (&ctx); + sha512_hmac_final (&ctx); const u32 r0 = l32_from_64_S (ctx.opad.h[7]); const u32 r1 = h32_from_64_S (ctx.opad.h[7]); diff --git a/OpenCL/m02000_a0.cl b/OpenCL/m02000_a0-optimized.cl similarity index 100% rename from OpenCL/m02000_a0.cl rename to OpenCL/m02000_a0-optimized.cl diff --git a/OpenCL/m02000_a1.cl b/OpenCL/m02000_a1-optimized.cl similarity index 100% rename from OpenCL/m02000_a1.cl rename to OpenCL/m02000_a1-optimized.cl diff --git a/OpenCL/m02000_a3.cl b/OpenCL/m02000_a3-optimized.cl similarity index 100% rename from OpenCL/m02000_a3.cl rename to OpenCL/m02000_a3-optimized.cl diff --git a/OpenCL/m02610_a0.cl b/OpenCL/m02610_a0.cl index 633ea6f13..a09b5b41e 100644 --- a/OpenCL/m02610_a0.cl +++ b/OpenCL/m02610_a0.cl @@ -77,7 +77,7 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -205,7 +205,7 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m02610_a1.cl b/OpenCL/m02610_a1.cl index 3818eb960..3fbdd72f9 100644 --- a/OpenCL/m02610_a1.cl +++ b/OpenCL/m02610_a1.cl @@ -62,7 +62,7 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -179,7 +179,7 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m02810_a0.cl b/OpenCL/m02810_a0.cl index 
02dfa50fd..502105952 100644 --- a/OpenCL/m02810_a0.cl +++ b/OpenCL/m02810_a0.cl @@ -77,7 +77,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -228,7 +228,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m02810_a1.cl b/OpenCL/m02810_a1.cl index 28b4e271d..df010bf8d 100644 --- a/OpenCL/m02810_a1.cl +++ b/OpenCL/m02810_a1.cl @@ -62,7 +62,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -202,7 +202,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index 0a8eff53b..48fa8911f 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -25,7 +25,7 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -__kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global 
const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32 *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -62,7 +62,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 pw_lenv = ceil ((float) pw_len / 4); - u32x w[64] = { 0 }; + u32 w[64] = { 0 }; for 
(int idx = 0; idx < pw_lenv; idx++) { @@ -75,7 +75,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -88,13 +88,13 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule * loop */ - u32x w0l = w[0]; + u32 w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0lr = w0l | w0r; + const u32 w0lr = w0l | w0r; w[0] = w0lr; @@ -106,10 +106,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_final_vector (&ctx0); - const u32x a = ctx0.h[0]; - const u32x b = ctx0.h[1]; - const u32x c = ctx0.h[2]; - const u32x d = ctx0.h[3]; + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; md5_ctx_vector_t ctx; @@ -161,16 +161,16 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h); - const u32x r0 = ctx.h[DGST_R0]; - const u32x r1 = ctx.h[DGST_R1]; - const u32x r2 = ctx.h[DGST_R2]; - const u32x r3 = ctx.h[DGST_R3]; + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; COMPARE_M_SIMD (r0, r1, r2, r3); } } -__kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 
*bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32 *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -219,7 +219,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 pw_lenv = ceil ((float) pw_len / 4); - u32x w[64] = { 0 }; + u32 w[64] = { 0 }; for (int idx = 0; idx < pw_lenv; idx++) { @@ -232,7 +232,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = 
ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -245,13 +245,13 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule * loop */ - u32x w0l = w[0]; + u32 w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32x w0lr = w0l | w0r; + const u32 w0lr = w0l | w0r; w[0] = w0lr; @@ -263,10 +263,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_final_vector (&ctx0); - const u32x a = ctx0.h[0]; - const u32x b = ctx0.h[1]; - const u32x c = ctx0.h[2]; - const u32x d = ctx0.h[3]; + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; md5_ctx_vector_t ctx; @@ -318,10 +318,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h); - const u32x r0 = ctx.h[DGST_R0]; - const u32x r1 = ctx.h[DGST_R1]; - const u32x r2 = ctx.h[DGST_R2]; - const u32x r3 = ctx.h[DGST_R3]; + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; COMPARE_S_SIMD (r0, r1, r2, r3); } diff --git a/OpenCL/m03000_a0.cl b/OpenCL/m03000_a0-optimized.cl similarity index 100% rename from OpenCL/m03000_a0.cl rename to OpenCL/m03000_a0-optimized.cl diff --git a/OpenCL/m03000_a1.cl b/OpenCL/m03000_a1-optimized.cl similarity index 100% rename from OpenCL/m03000_a1.cl rename to OpenCL/m03000_a1-optimized.cl diff --git a/OpenCL/m03000_a3.cl b/OpenCL/m03000_a3-optimized.cl similarity index 100% rename from OpenCL/m03000_a3.cl rename to OpenCL/m03000_a3-optimized.cl diff --git a/OpenCL/m03710_a0.cl b/OpenCL/m03710_a0.cl index 054465f2d..89b894741 100644 --- a/OpenCL/m03710_a0.cl +++ b/OpenCL/m03710_a0.cl @@ -77,7 +77,7 @@ __kernel void 
m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -218,7 +218,7 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m03710_a1.cl b/OpenCL/m03710_a1.cl index b74fa951f..16e7caa5e 100644 --- a/OpenCL/m03710_a1.cl +++ b/OpenCL/m03710_a1.cl @@ -62,7 +62,7 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -192,7 +192,7 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m03910_a0.cl b/OpenCL/m03910_a0.cl index ab08e1a87..1139e0cb6 100644 --- a/OpenCL/m03910_a0.cl +++ b/OpenCL/m03910_a0.cl @@ -77,7 +77,7 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -228,7 +228,7 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m03910_a1.cl b/OpenCL/m03910_a1.cl index c4186f01e..40c9071dc 100644 --- a/OpenCL/m03910_a1.cl +++ b/OpenCL/m03910_a1.cl @@ -62,7 +62,7 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = 
{ 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -202,7 +202,7 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[8] = { 0 }; + u32 s[8] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m04310_a0.cl b/OpenCL/m04310_a0.cl index 8ff005fa0..31d1af318 100644 --- a/OpenCL/m04310_a0.cl +++ b/OpenCL/m04310_a0.cl @@ -77,7 +77,7 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -205,7 +205,7 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m04310_a1.cl b/OpenCL/m04310_a1.cl index aea272f93..bf33a9155 100644 --- a/OpenCL/m04310_a1.cl +++ b/OpenCL/m04310_a1.cl @@ -62,7 +62,7 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { @@ -179,7 +179,7 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 salt_lenv = ceil ((float) salt_len / 4); - u32x s[64] = { 0 }; + u32 s[64] = { 0 }; for (int idx = 0; idx < salt_lenv; idx++) { diff --git a/OpenCL/m04800_a0.cl b/OpenCL/m04800_a0.cl index 02c9903fa..cd7c9cc07 100644 --- a/OpenCL/m04800_a0.cl +++ b/OpenCL/m04800_a0.cl @@ -72,7 +72,7 @@ __kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_update (&ctx, w, pw_len); - md5_update_vector (&ctx, s, salt_len); + md5_update (&ctx, s, salt_len); md5_final (&ctx); @@ -154,7 +154,7 @@ __kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_update 
(&ctx, w, pw_len); - md5_update_vector (&ctx, s, salt_len); + md5_update (&ctx, s, salt_len); md5_final (&ctx); diff --git a/OpenCL/m04800_a1.cl b/OpenCL/m04800_a1.cl index d7e968d1c..b213e74c2 100644 --- a/OpenCL/m04800_a1.cl +++ b/OpenCL/m04800_a1.cl @@ -57,7 +57,7 @@ __kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); - md5_update_vector (&ctx, s, salt_len); + md5_update (&ctx, s, salt_len); md5_final (&ctx); @@ -126,7 +126,7 @@ __kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); - md5_update_vector (&ctx, s, salt_len); + md5_update (&ctx, s, salt_len); md5_final (&ctx); diff --git a/OpenCL/m05400_a0.cl b/OpenCL/m05400_a0.cl index 34b78379f..97d759c03 100644 --- a/OpenCL/m05400_a0.cl +++ b/OpenCL/m05400_a0.cl @@ -89,10 +89,10 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_final (&ctx); - const u32x r0 = ctx.opad.h[DGST_R0]; - const u32x r1 = ctx.opad.h[DGST_R1]; - const u32x r2 = ctx.opad.h[DGST_R2]; - const u32x r3 = ctx.opad.h[DGST_R3]; + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; COMPARE_M_SCALAR (r0, r1, r2, r3); } @@ -184,10 +184,10 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_final (&ctx); - const u32x r0 = ctx.opad.h[DGST_R0]; - const u32x r1 = ctx.opad.h[DGST_R1]; - const u32x r2 = ctx.opad.h[DGST_R2]; - const u32x r3 = ctx.opad.h[DGST_R3]; + const u32 r0 = ctx.opad.h[DGST_R0]; + const u32 r1 = ctx.opad.h[DGST_R1]; + const u32 r2 = ctx.opad.h[DGST_R2]; + const u32 r3 = ctx.opad.h[DGST_R3]; COMPARE_S_SCALAR (r0, r1, r2, r3); } diff --git a/OpenCL/m05500_a0.cl b/OpenCL/m05500_a0.cl index 3c8a790ae..3780e2e1b 100644 --- a/OpenCL/m05500_a0.cl +++ 
b/OpenCL/m05500_a0.cl @@ -344,18 +344,18 @@ __constant u32a c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif -void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) +void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 (*s_SPtrans)[64]) { - u32x r = data[0]; - u32x l = data[1]; + u32 r = data[0]; + u32 l = data[1]; #ifdef _unroll #pragma unroll #endif for (u32 i = 0; i < 16; i += 2) { - u32x u; - u32x t; + u32 u; + u32 t; u = Kc[i + 0] ^ rotl32 (r, 30u); t = Kd[i + 0] ^ rotl32 (r, 26u); @@ -386,9 +386,9 @@ void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __l iv[1] = r; } -void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) +void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 (*s_skb)[64]) { - u32x tt; + u32 tt; PERM_OP (d, c, tt, 4, 0x0f0f0f0f); HPERM_OP (c, tt, 2, 0xcccc0000); @@ -423,13 +423,13 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 c = c & 0x0fffffff; d = d & 0x0fffffff; - const u32x c00 = (c >> 0) & 0x0000003f; - const u32x c06 = (c >> 6) & 0x00383003; - const u32x c07 = (c >> 7) & 0x0000003c; - const u32x c13 = (c >> 13) & 0x0000060f; - const u32x c20 = (c >> 20) & 0x00000001; + const u32 c00 = (c >> 0) & 0x0000003f; + const u32 c06 = (c >> 6) & 0x00383003; + const u32 c07 = (c >> 7) & 0x0000003c; + const u32 c13 = (c >> 13) & 0x0000060f; + const u32 c20 = (c >> 20) & 0x00000001; - u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) | BOX (((c06 >> 0) & 0xff) |((c07 >> 0) & 0xff), 1, s_skb) | BOX (((c13 >> 0) & 0xff) @@ 
-438,12 +438,12 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 |((c13 >> 8) & 0xff) |((c06 >> 16) & 0xff), 3, s_skb); - const u32x d00 = (d >> 0) & 0x00003c3f; - const u32x d07 = (d >> 7) & 0x00003f03; - const u32x d21 = (d >> 21) & 0x0000000f; - const u32x d22 = (d >> 22) & 0x00000030; + const u32 d00 = (d >> 0) & 0x00003c3f; + const u32 d07 = (d >> 7) & 0x00003f03; + const u32 d21 = (d >> 21) & 0x0000000f; + const u32 d22 = (d >> 22) & 0x00000030; - u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) | BOX (((d07 >> 0) & 0xff) |((d00 >> 8) & 0xff), 5, s_skb) | BOX (((d07 >> 8) & 0xff), 6, s_skb) @@ -455,9 +455,9 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) +void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 out[2]) { - u32x t[8]; + u32 t[8]; t[0] = (w0 >> 0) & 0xff; t[1] = (w0 >> 8) & 0xff; @@ -468,7 +468,7 @@ void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) t[6] = (w1 >> 16) & 0xff; t[7] = (w1 >> 24) & 0xff; - u32x k[8]; + u32 k[8]; k[0] = (t[0] >> 0); k[1] = (t[0] << 7) | (t[1] >> 1); diff --git a/OpenCL/m05500_a1.cl b/OpenCL/m05500_a1.cl index 34bc4711f..be64670a1 100644 --- a/OpenCL/m05500_a1.cl +++ b/OpenCL/m05500_a1.cl @@ -341,18 +341,18 @@ __constant u32a c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif -void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) +void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 (*s_SPtrans)[64]) { - u32x r = data[0]; - u32x l = data[1]; 
+ u32 r = data[0]; + u32 l = data[1]; #ifdef _unroll #pragma unroll #endif for (u32 i = 0; i < 16; i += 2) { - u32x u; - u32x t; + u32 u; + u32 t; u = Kc[i + 0] ^ rotl32 (r, 30u); t = Kd[i + 0] ^ rotl32 (r, 26u); @@ -383,9 +383,9 @@ void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __l iv[1] = r; } -void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) +void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 (*s_skb)[64]) { - u32x tt; + u32 tt; PERM_OP (d, c, tt, 4, 0x0f0f0f0f); HPERM_OP (c, tt, 2, 0xcccc0000); @@ -420,13 +420,13 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 c = c & 0x0fffffff; d = d & 0x0fffffff; - const u32x c00 = (c >> 0) & 0x0000003f; - const u32x c06 = (c >> 6) & 0x00383003; - const u32x c07 = (c >> 7) & 0x0000003c; - const u32x c13 = (c >> 13) & 0x0000060f; - const u32x c20 = (c >> 20) & 0x00000001; + const u32 c00 = (c >> 0) & 0x0000003f; + const u32 c06 = (c >> 6) & 0x00383003; + const u32 c07 = (c >> 7) & 0x0000003c; + const u32 c13 = (c >> 13) & 0x0000060f; + const u32 c20 = (c >> 20) & 0x00000001; - u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) | BOX (((c06 >> 0) & 0xff) |((c07 >> 0) & 0xff), 1, s_skb) | BOX (((c13 >> 0) & 0xff) @@ -435,12 +435,12 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 |((c13 >> 8) & 0xff) |((c06 >> 16) & 0xff), 3, s_skb); - const u32x d00 = (d >> 0) & 0x00003c3f; - const u32x d07 = (d >> 7) & 0x00003f03; - const u32x d21 = (d >> 21) & 0x0000000f; - const u32x d22 = (d >> 22) & 0x00000030; + const u32 d00 = (d >> 0) & 0x00003c3f; + const u32 d07 = (d >> 7) & 0x00003f03; + const u32 d21 = (d >> 21) & 0x0000000f; + const u32 d22 = (d >> 22) & 0x00000030; - u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) | BOX (((d07 >> 0) & 0xff) |((d00 >> 8) & 0xff), 5, s_skb) | BOX 
(((d07 >> 8) & 0xff), 6, s_skb) @@ -452,9 +452,9 @@ void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 } } -void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) +void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 out[2]) { - u32x t[8]; + u32 t[8]; t[0] = (w0 >> 0) & 0xff; t[1] = (w0 >> 8) & 0xff; @@ -465,7 +465,7 @@ void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) t[6] = (w1 >> 16) & 0xff; t[7] = (w1 >> 24) & 0xff; - u32x k[8]; + u32 k[8]; k[0] = (t[0] >> 0); k[1] = (t[0] << 7) | (t[1] >> 1); diff --git a/OpenCL/m10800_a0.cl b/OpenCL/m10800_a0.cl index 1ae43d030..5e0780118 100644 --- a/OpenCL/m10800_a0.cl +++ b/OpenCL/m10800_a0.cl @@ -59,10 +59,10 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_M_SCALAR (r0, r1, r2, r3); } @@ -124,10 +124,10 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_S_SCALAR (r0, r1, r2, r3); } diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index 9f1eded3a..a666a6019 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -46,10 +46,10 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 
(ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_M_SCALAR (r0, r1, r2, r3); } @@ -100,10 +100,10 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha384_final (&ctx); - const u32x r0 = l32_from_64 (ctx.h[3]); - const u32x r1 = h32_from_64 (ctx.h[3]); - const u32x r2 = l32_from_64 (ctx.h[2]); - const u32x r3 = h32_from_64 (ctx.h[2]); + const u32 r0 = l32_from_64 (ctx.h[3]); + const u32 r1 = h32_from_64 (ctx.h[3]); + const u32 r2 = l32_from_64 (ctx.h[2]); + const u32 r3 = h32_from_64 (ctx.h[2]); COMPARE_S_SCALAR (r0, r1, r2, r3); } From af6052d34b982575a917696ed00cd084f107ad46 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 15:01:16 +0200 Subject: [PATCH 40/75] Revert some invalid rename of kernel files --- OpenCL/{m01500_a0-optimized.cl => m01500_a0.cl} | 0 OpenCL/{m01500_a1-optimized.cl => m01500_a1.cl} | 0 OpenCL/{m01500_a3-optimized.cl => m01500_a3.cl} | 0 OpenCL/{m02000_a0-optimized.cl => m02000_a0.cl} | 0 OpenCL/{m02000_a1-optimized.cl => m02000_a1.cl} | 0 OpenCL/{m02000_a3-optimized.cl => m02000_a3.cl} | 0 OpenCL/{m03000_a0-optimized.cl => m03000_a0.cl} | 0 OpenCL/{m03000_a1-optimized.cl => m03000_a1.cl} | 0 OpenCL/{m03000_a3-optimized.cl => m03000_a3.cl} | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename OpenCL/{m01500_a0-optimized.cl => m01500_a0.cl} (100%) rename OpenCL/{m01500_a1-optimized.cl => m01500_a1.cl} (100%) rename OpenCL/{m01500_a3-optimized.cl => m01500_a3.cl} (100%) rename OpenCL/{m02000_a0-optimized.cl => m02000_a0.cl} (100%) rename OpenCL/{m02000_a1-optimized.cl => m02000_a1.cl} (100%) rename OpenCL/{m02000_a3-optimized.cl => m02000_a3.cl} (100%) rename OpenCL/{m03000_a0-optimized.cl => m03000_a0.cl} (100%) rename 
OpenCL/{m03000_a1-optimized.cl => m03000_a1.cl} (100%) rename OpenCL/{m03000_a3-optimized.cl => m03000_a3.cl} (100%) diff --git a/OpenCL/m01500_a0-optimized.cl b/OpenCL/m01500_a0.cl similarity index 100% rename from OpenCL/m01500_a0-optimized.cl rename to OpenCL/m01500_a0.cl diff --git a/OpenCL/m01500_a1-optimized.cl b/OpenCL/m01500_a1.cl similarity index 100% rename from OpenCL/m01500_a1-optimized.cl rename to OpenCL/m01500_a1.cl diff --git a/OpenCL/m01500_a3-optimized.cl b/OpenCL/m01500_a3.cl similarity index 100% rename from OpenCL/m01500_a3-optimized.cl rename to OpenCL/m01500_a3.cl diff --git a/OpenCL/m02000_a0-optimized.cl b/OpenCL/m02000_a0.cl similarity index 100% rename from OpenCL/m02000_a0-optimized.cl rename to OpenCL/m02000_a0.cl diff --git a/OpenCL/m02000_a1-optimized.cl b/OpenCL/m02000_a1.cl similarity index 100% rename from OpenCL/m02000_a1-optimized.cl rename to OpenCL/m02000_a1.cl diff --git a/OpenCL/m02000_a3-optimized.cl b/OpenCL/m02000_a3.cl similarity index 100% rename from OpenCL/m02000_a3-optimized.cl rename to OpenCL/m02000_a3.cl diff --git a/OpenCL/m03000_a0-optimized.cl b/OpenCL/m03000_a0.cl similarity index 100% rename from OpenCL/m03000_a0-optimized.cl rename to OpenCL/m03000_a0.cl diff --git a/OpenCL/m03000_a1-optimized.cl b/OpenCL/m03000_a1.cl similarity index 100% rename from OpenCL/m03000_a1-optimized.cl rename to OpenCL/m03000_a1.cl diff --git a/OpenCL/m03000_a3-optimized.cl b/OpenCL/m03000_a3.cl similarity index 100% rename from OpenCL/m03000_a3-optimized.cl rename to OpenCL/m03000_a3.cl From 89d52f82098063e3d6c274ab68abdc02763c118c Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 1 Aug 2017 17:39:12 +0200 Subject: [PATCH 41/75] Add pure kernels for Kerberos 5 AS-REQ Pre-Auth etype 23 --- OpenCL/m07500_a0.cl | 425 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m07500_a1.cl | 401 ++++++++++++++++++++++++++++++++++++++ OpenCL/m07500_a3.cl | 459 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1285 insertions(+) create 
mode 100644 OpenCL/m07500_a0.cl create mode 100644 OpenCL/m07500_a1.cl create mode 100644 OpenCL/m07500_a3.cl diff --git a/OpenCL/m07500_a0.cl b/OpenCL/m07500_a0.cl new file mode 100644 index 000000000..e38afb3dd --- /dev/null +++ b/OpenCL/m07500_a0.cl @@ -0,0 +1,425 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, 
idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], u32 timestamp_ct[8]) +{ + rc4_init_16 (rc4_key, data); + + u32 out[4]; + + u8 j = 0; + + j = rc4_next_16 (rc4_key, 0, j, timestamp_ct + 0, out); + + if ((out[3] & 0xffff0000) != 0x30320000) return 0; + + j = rc4_next_16 (rc4_key, 16, j, timestamp_ct + 4, out); + + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || 
((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 1; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + 
digest[3] = ctx.opad.h[3]; +} + +__kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = 
krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) 
return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m07500_a1.cl b/OpenCL/m07500_a1.cl new file mode 100644 index 000000000..0942bb921 --- /dev/null +++ b/OpenCL/m07500_a1.cl @@ -0,0 +1,401 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" 
+#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + 
+ u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], u32 timestamp_ct[8]) +{ + rc4_init_16 (rc4_key, data); + + u32 out[4]; + + u8 j = 0; + + j = rc4_next_16 (rc4_key, 0, j, timestamp_ct + 0, out); + + if ((out[3] & 0xffff0000) != 0x30320000) return 0; + + j = rc4_next_16 (rc4_key, 16, j, timestamp_ct + 4, out); + + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4]) +{ + // K1=MD5_HMAC(K,1); with 1 
encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 1; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; +} + +__kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t 
*plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc 
(&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + 
timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m07500_a3.cl b/OpenCL/m07500_a3.cl new file mode 100644 index 000000000..2afee5d50 --- /dev/null +++ b/OpenCL/m07500_a3.cl @@ -0,0 +1,459 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v 
>> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], u32 timestamp_ct[8]) +{ + rc4_init_16 (rc4_key, data); + + u32 out[4]; + + u8 j = 0; + + j = rc4_next_16 (rc4_key, 
0, j, timestamp_ct + 0, out); + + if ((out[3] & 0xffff0000) != 0x30320000) return 0; + + j = rc4_next_16 (rc4_key, 16, j, timestamp_ct + 4, out); + + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; out[0] >>= 8; + if (((out[0] & 0xff) < '0') || ((out[0] & 0xff) > '9')) return 0; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; out[1] >>= 8; + if (((out[1] & 0xff) < '0') || ((out[1] & 0xff) > '9')) return 0; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; out[2] >>= 8; + if (((out[2] & 0xff) < '0') || ((out[2] & 0xff) > '9')) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 1; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = 
ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; +} + +__kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + 
digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, 
__global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5pa_t *krb5pa_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5pa_bufs[digests_offset].checksum[0]; + checksum[1] = krb5pa_bufs[digests_offset].checksum[1]; + checksum[2] = krb5pa_bufs[digests_offset].checksum[2]; + checksum[3] = krb5pa_bufs[digests_offset].checksum[3]; + + u32 timestamp_ct[8]; + + timestamp_ct[0] = krb5pa_bufs[digests_offset].timestamp[0]; + timestamp_ct[1] = krb5pa_bufs[digests_offset].timestamp[1]; + 
timestamp_ct[2] = krb5pa_bufs[digests_offset].timestamp[2]; + timestamp_ct[3] = krb5pa_bufs[digests_offset].timestamp[3]; + timestamp_ct[4] = krb5pa_bufs[digests_offset].timestamp[4]; + timestamp_ct[5] = krb5pa_bufs[digests_offset].timestamp[5]; + timestamp_ct[6] = krb5pa_bufs[digests_offset].timestamp[6]; + timestamp_ct[7] = krb5pa_bufs[digests_offset].timestamp[7]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + kerb_prepare (ctx.h, checksum, digest); + + if (decrypt_and_check (&rc4_keys[lid], digest, timestamp_ct) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} From bc6b8ca1c9dbda9dfaa82653d6c725a2b3fba2e9 Mon Sep 17 00:00:00 2001 From: jsteube Date: Wed, 2 Aug 2017 13:14:01 +0200 Subject: [PATCH 42/75] Mix in pure kernel functions to SAP CODVN B (BCODE) --- OpenCL/m07700_a0-optimized.cl | 351 ++++---------------------------- OpenCL/m07700_a1-optimized.cl | 351 ++++---------------------------- OpenCL/m07700_a3-optimized.cl | 371 +++++----------------------------- 3 files changed, 121 insertions(+), 952 deletions(-) diff --git a/OpenCL/m07700_a0-optimized.cl b/OpenCL/m07700_a0-optimized.cl index 2fa57cdbc..8a6b88cf1 100644 --- a/OpenCL/m07700_a0-optimized.cl +++ b/OpenCL/m07700_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -335,174 +336,35 @@ __kernel void m07700_m04 (__global pw_t *pws, 
__global const kernel_rule_t *rule * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], 
MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, out_len, salt_buf0, salt_len, 
a, b, c, d, t); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP 
(MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], 
MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_M_SIMD (a, b, c, d); + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -650,174 +512,35 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - 
MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], 
MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, out_len, salt_buf0, salt_len, a, b, c, d, t); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, 
c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - 
MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_S_SIMD (a, b, c, d); + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m07700_a1-optimized.cl b/OpenCL/m07700_a1-optimized.cl index cdc24160a..d2d0350c7 100644 --- a/OpenCL/m07700_a1-optimized.cl +++ b/OpenCL/m07700_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -376,174 +377,35 @@ __kernel void m07700_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, 
a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 
9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, 
c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); 
- MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_M_SIMD (a, b, c, d); + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -734,174 +596,35 @@ __kernel void m07700_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, 
MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); 
- MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], 
MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_S_SIMD (a, b, c, d); + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m07700_a3-optimized.cl b/OpenCL/m07700_a3-optimized.cl index 
81214dbce..9b0b3acdd 100644 --- a/OpenCL/m07700_a3-optimized.cl +++ b/OpenCL/m07700_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -85,10 +86,10 @@ u32 walld0rf_magic (const u32 w0[4], const u32 pw_len, const u32 salt_buf0[4], c t[15] = 0; u32 sum20 = ((a >> 24) & 3) - + ((a >> 16) & 3) - + ((a >> 8) & 3) - + ((a >> 0) & 3) - + ((b >> 8) & 3); + + ((a >> 16) & 3) + + ((a >> 8) & 3) + + ((a >> 0) & 3) + + ((b >> 8) & 3); sum20 |= 0x20; @@ -259,6 +260,8 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl s3[2] = 0; s3[3] = 0; + append_0x80_4x4_S (s0, s1, s2, s3, salt_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -284,7 +287,7 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[ 2] = s0[2]; t[ 3] = s0[3]; t[ 4] = s1[0]; - t[ 5] = 0; + t[ 5] = s1[1]; t[ 6] = 0; t[ 7] = 0; t[ 8] = 0; @@ -296,180 +299,39 @@ void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[14] = pw_salt_len * 8; t[15] = 0; - append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); - /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - 
MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP 
(MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, 
MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, 
a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_M_SIMD (a, b, c, d); + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -523,6 +385,8 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl s3[2] = 0; s3[3] = 0; + append_0x80_4x4_S (s0, s1, s2, s3, salt_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -560,7 +424,7 @@ void m07700s (u32 w0[4], u32 w1[4], u32 
w2[4], u32 w3[4], const u32 pw_len, __gl t[ 2] = s0[2]; t[ 3] = s0[3]; t[ 4] = s1[0]; - t[ 5] = 0; + t[ 5] = s1[1]; t[ 6] = 0; t[ 7] = 0; t[ 8] = 0; @@ -572,180 +436,39 @@ void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl t[14] = pw_salt_len * 8; t[15] = 0; - append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, pw_salt_len); - /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32 digest[4]; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, 
MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, 
d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, a, b, c, d, t); + const u32 sum20 = walld0rf_magic (w0, pw_len, salt_buf0, salt_len, digest[0], digest[1], digest[2], digest[3], t); append_0x80_4x4_S (t + 0, t + 4, t + 8, t + 12, sum20); t[14] = sum20 * 8; + t[15] = 0; - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; + digest[0] = MD5M_A; + digest[1] = MD5M_B; + digest[2] = MD5M_C; + digest[3] = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, t[ 0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 4], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 5], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[ 6], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[ 7], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[ 8], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[ 9], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[10], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[11], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, t[12], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, t[13], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, t[14], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, t[15], MD5C0f, MD5S03); + md5_transform (t + 0, t + 4, t + 8, t + 12, digest); - MD5_STEP (MD5_Go, a, b, c, d, t[ 1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 6], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[11], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 5], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[10], MD5C15, MD5S11); - MD5_STEP (MD5_Go, 
c, d, a, b, t[15], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 4], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[ 9], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[14], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[ 8], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, t[13], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, t[ 2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, t[ 7], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, t[12], MD5C1f, MD5S13); + const u32 r0 = digest[0] ^ digest[2]; + const u32 r1 = digest[1] ^ digest[3]; + const u32 r2 = 0; + const u32 r3 = 0; - MD5_STEP (MD5_H , a, b, c, d, t[ 5], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 8], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[11], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[14], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 4], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 7], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[10], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[13], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[ 0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[ 3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 6], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, t[ 9], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, t[12], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, t[15], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, t[ 2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, t[ 0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 7], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[14], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 5], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[12], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[ 3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[10], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 1], MD5C37, 
MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 8], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[15], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 6], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[13], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, t[ 4], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, t[11], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, t[ 2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, t[ 9], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - a ^= c; - b ^= d; - c = 0; - d = 0; - - COMPARE_S_SIMD (a, b, c, d); + COMPARE_S_SIMD (r0, r1, r2, r3); } } From 5da64a1a432b729b116eaa46e7d6154207b9b3bf Mon Sep 17 00:00:00 2001 From: jsteube Date: Wed, 2 Aug 2017 13:24:54 +0200 Subject: [PATCH 43/75] Mix in pure kernel functions to SAP CODVN F/G (PASSCODE) --- OpenCL/m07800_a0-optimized.cl | 129 +--------------------------------- OpenCL/m07800_a1-optimized.cl | 129 +--------------------------------- OpenCL/m07800_a3-optimized.cl | 129 +--------------------------------- 3 files changed, 3 insertions(+), 384 deletions(-) diff --git a/OpenCL/m07800_a0-optimized.cl b/OpenCL/m07800_a0-optimized.cl index 9c0edfb9d..616e5483c 100644 --- a/OpenCL/m07800_a0-optimized.cl +++ b/OpenCL/m07800_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" __constant u32 theMagicArray[64] = { @@ -52,134 +53,6 @@ void SETSHIFTEDINT (u32 *a, const int n, const u32 v) a[d + 1] = l32_from_64_S (tmp); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 
wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 
((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 
((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 
((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m07800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const 
u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m07800_a1-optimized.cl b/OpenCL/m07800_a1-optimized.cl index 1d6e36112..3de195d64 100644 --- a/OpenCL/m07800_a1-optimized.cl +++ b/OpenCL/m07800_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" __constant u32 theMagicArray[64] = { @@ -50,134 +51,6 @@ void SETSHIFTEDINT (u32 *a, const int n, const u32 v) a[d + 1] = l32_from_64_S (tmp); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = 
rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 
((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - 
#undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel 
void m07800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m07800_a3-optimized.cl b/OpenCL/m07800_a3-optimized.cl index 2dfe9f04b..b57f9496b 100644 --- a/OpenCL/m07800_a3-optimized.cl +++ b/OpenCL/m07800_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" __constant u32 theMagicArray[64] = { @@ -50,134 +51,6 @@ void SETSHIFTEDINT (u32 *a, const int n, const u32 v) a[d + 1] = l32_from_64_S (tmp); } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 
w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP 
(SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP 
(SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP 
(SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - void m07800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, 
const u32 digests_offset) { /** From 4626270a1eebe536f515662e3ce0a4f87b593eef Mon Sep 17 00:00:00 2001 From: jsteube Date: Wed, 2 Aug 2017 13:36:00 +0200 Subject: [PATCH 44/75] Set maximum allowed password length for Sybase ASE to 30, according to documentation --- src/interface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/interface.c b/src/interface.c index d5aca61ca..8a16f23b9 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24745,6 +24745,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 7700: hashconfig->pw_max = 8; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 7800: hashconfig->pw_max = 40; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 7900: hashconfig->pw_max = PW_MAX; break; + case 8000: hashconfig->pw_max = 30; break; // http://infocenter.sybase.com/help/index.jsp?topic=/com.sybase.infocenter.dc31654.1570/html/sag1/CIHIBDBA.htm case 8200: hashconfig->pw_max = PW_MAX; break; case 8500: hashconfig->pw_max = 8; break; // Underlaying DES max case 8600: hashconfig->pw_max = 16; break; // Lotus Notes/Domino 5 limits itself to 8 From aafda5fa1b79b50041890b3d6f8e4827ec41eaf7 Mon Sep 17 00:00:00 2001 From: jsteube Date: Wed, 2 Aug 2017 14:12:27 +0200 Subject: [PATCH 45/75] Add pure kernels for Citrix NetScaler --- OpenCL/m08100_a0.cl | 142 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m08100_a1.cl | 114 ++++++++++++++++++++++++++++++++ OpenCL/m08100_a3.cl | 156 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 412 insertions(+) create mode 100644 OpenCL/m08100_a0.cl create mode 100644 OpenCL/m08100_a1.cl create mode 100644 OpenCL/m08100_a3.cl diff --git a/OpenCL/m08100_a0.cl b/OpenCL/m08100_a0.cl new file mode 100644 index 000000000..8d087d790 --- /dev/null +++ b/OpenCL/m08100_a0.cl @@ -0,0 +1,142 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include 
"inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = 
ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + 
/** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_swap (&ctx, w, pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08100_a1.cl b/OpenCL/m08100_a1.cl new file mode 100644 index 000000000..6fd9dcbec --- /dev/null +++ b/OpenCL/m08100_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if 
(gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + 
digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len + 1); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08100_a3.cl b/OpenCL/m08100_a3.cl new file mode 100644 index 000000000..59f0b281a --- /dev/null +++ b/OpenCL/m08100_a3.cl @@ -0,0 +1,156 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const 
u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector (&ctx, w, pw_len + 1); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector (&ctx, w, pw_len + 1); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 7cf3c29ef5c184e4d0466eaf6eb1351a342b5fa6 Mon Sep 17 00:00:00 2001 From: jsteube Date: Wed, 2 Aug 2017 14:34:36 +0200 Subject: [PATCH 46/75] Mix in pure kernel functions to DNSSEC (NSEC3) --- OpenCL/m08300_a0-optimized.cl | 137 ++-------------------------------- OpenCL/m08300_a1-optimized.cl | 137 
++-------------------------------- OpenCL/m08300_a3-optimized.cl | 137 ++-------------------------------- 3 files changed, 15 insertions(+), 396 deletions(-) diff --git a/OpenCL/m08300_a0-optimized.cl b/OpenCL/m08300_a0-optimized.cl index 654747a75..3d3ae0965 100644 --- a/OpenCL/m08300_a0-optimized.cl +++ b/OpenCL/m08300_a0-optimized.cl @@ -13,134 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP 
(SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, 
B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ 
w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m08300_m04 (__global pw_t 
*pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -355,7 +228,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -384,7 +257,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); @@ -624,7 +497,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -653,7 +526,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; 
- sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); diff --git a/OpenCL/m08300_a1-optimized.cl b/OpenCL/m08300_a1-optimized.cl index 889ee8c28..99590b478 100644 --- a/OpenCL/m08300_a1-optimized.cl +++ b/OpenCL/m08300_a1-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 
1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); 
SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t 
= rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void 
m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -413,7 +286,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -442,7 +315,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); @@ -742,7 +615,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -771,7 +644,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = 
SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); diff --git a/OpenCL/m08300_a3-optimized.cl b/OpenCL/m08300_a3-optimized.cl index 2aaa7222f..8c0d66eb8 100644 --- a/OpenCL/m08300_a3-optimized.cl +++ b/OpenCL/m08300_a3-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 
((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ 
wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - 
#define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include 
"inc_hash_sha1.cl" void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -312,7 +185,7 @@ void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -341,7 +214,7 @@ void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); @@ -532,7 +405,7 @@ void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); // iterations @@ -561,7 +434,7 @@ void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 
w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); From c2a8ae0207eb31976a1a788d2d2396e1e22559f4 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 12:21:53 +0200 Subject: [PATCH 47/75] Add pure kernels for WBB3 (Woltlab Burning Board) --- OpenCL/m08400_a0.cl | 360 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m08400_a1.cl | 336 +++++++++++++++++++++++++++++++++++++++ OpenCL/m08400_a3.cl | 378 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1074 insertions(+) create mode 100644 OpenCL/m08400_a0.cl create mode 100644 OpenCL/m08400_a1.cl create mode 100644 OpenCL/m08400_a3.cl diff --git a/OpenCL/m08400_a0.cl b/OpenCL/m08400_a0.cl new file mode 100644 index 000000000..021a82f91 --- /dev/null +++ b/OpenCL/m08400_a0.cl @@ -0,0 +1,360 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], 
l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] 
= 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const 
u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 
16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08400_a1.cl b/OpenCL/m08400_a1.cl new file mode 100644 index 000000000..f2ecd46e6 --- /dev/null +++ 
b/OpenCL/m08400_a1.cl @@ -0,0 +1,336 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c 
>> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + 
sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx1l; + + sha1_init (&ctx1l); + + sha1_update_global_swap (&ctx1l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx1l; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + u32 a = ctx1.h[0]; + u32 b = ctx1.h[1]; + u32 c = ctx1.h[2]; + u32 d = ctx1.h[3]; + u32 e = ctx1.h[4]; + + sha1_ctx_t ctx2 = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] 
= uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_t ctx = ctx0; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08400_a3.cl b/OpenCL/m08400_a3.cl new file mode 100644 index 000000000..769795b39 --- /dev/null +++ b/OpenCL/m08400_a3.cl @@ -0,0 +1,378 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + 
+#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 
bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_final_vector (&ctx1); + + u32x a = ctx1.h[0]; + u32x b = ctx1.h[1]; + u32x c = ctx1.h[2]; + u32x d = ctx1.h[3]; + u32x e = ctx1.h[4]; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector_from_scalar (&ctx2, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = 
uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + 
w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_final_vector (&ctx1); + + u32x a = ctx1.h[0]; + u32x b = ctx1.h[1]; + u32x c = ctx1.h[2]; + u32x d = ctx1.h[3]; + u32x e = ctx1.h[4]; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector_from_scalar (&ctx2, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = 
uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx2, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx2); + + a = ctx2.h[0]; + b = ctx2.h[1]; + c = ctx2.h[2]; + d = ctx2.h[3]; + e = ctx2.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 40); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x 
r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 54eb0b158d5f62feb88eb72f7154fd7aeec9d01f Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 12:35:05 +0200 Subject: [PATCH 48/75] Prepare DNSSEC (NSEC3) optimized kernel for pure kernel version --- OpenCL/m08300_a0-optimized.cl | 4 ++-- OpenCL/m08300_a1-optimized.cl | 4 ++-- OpenCL/m08300_a3-optimized.cl | 4 ++-- src/interface.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/OpenCL/m08300_a0-optimized.cl b/OpenCL/m08300_a0-optimized.cl index 3d3ae0965..0e3ed4fd2 100644 --- a/OpenCL/m08300_a0-optimized.cl +++ b/OpenCL/m08300_a0-optimized.cl @@ -77,7 +77,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * loop @@ -334,7 +334,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * digest diff --git a/OpenCL/m08300_a1-optimized.cl b/OpenCL/m08300_a1-optimized.cl index 99590b478..3ed52170c 100644 --- a/OpenCL/m08300_a1-optimized.cl +++ b/OpenCL/m08300_a1-optimized.cl @@ -75,7 +75,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * loop @@ -392,7 +392,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global const kernel_rule_t *rule domain_buf1[2] = salt_bufs[salt_pos].salt_buf_pc[ 6]; domain_buf1[3] = 0; - const 
u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; /** * digest diff --git a/OpenCL/m08300_a3-optimized.cl b/OpenCL/m08300_a3-optimized.cl index 8c0d66eb8..6827e92f9 100644 --- a/OpenCL/m08300_a3-optimized.cl +++ b/OpenCL/m08300_a3-optimized.cl @@ -54,7 +54,7 @@ void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl domain_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[ 6]); domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; u32 s0[4]; u32 s1[4]; @@ -262,7 +262,7 @@ void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl domain_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[ 6]); domain_buf1[3] = 0; - const u32 domain_len = salt_bufs[salt_pos].salt_buf_pc[ 7]; + const u32 domain_len = salt_bufs[salt_pos].salt_len_pc; u32 s0[4]; u32 s1[4]; diff --git a/src/interface.c b/src/interface.c index 8a16f23b9..423846406 100644 --- a/src/interface.c +++ b/src/interface.c @@ -8004,7 +8004,7 @@ int nsec3_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUS } } - salt->salt_buf_pc[7] = domainbuf_len; + salt->salt_len_pc = domainbuf_len; // "real" salt @@ -17612,7 +17612,7 @@ int ascii_digest (hashcat_ctx_t *hashcat_ctx, char *out_buf, const size_t out_le // domain - const u32 salt_pc_len = salt.salt_buf_pc[7]; // what a hack + const u32 salt_pc_len = salt.salt_len_pc; char domain_buf_c[33] = { 0 }; From c68191e47a21a81114bf91651015a0653d1a6384 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 13:05:30 +0200 Subject: [PATCH 49/75] Add pure kernels for DNSSEC (NSEC3) --- OpenCL/m08300_a0.cl | 276 ++++++++++++++++++++++++++++++++++++++++++ OpenCL/m08300_a1.cl | 248 ++++++++++++++++++++++++++++++++++++++ OpenCL/m08300_a3.cl | 286 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 810 insertions(+) create mode 100644 OpenCL/m08300_a0.cl 
create mode 100644 OpenCL/m08300_a1.cl create mode 100644 OpenCL/m08300_a3.cl diff --git a/OpenCL/m08300_a0.cl b/OpenCL/m08300_a0.cl new file mode 100644 index 000000000..39cdf7d2a --- /dev/null +++ b/OpenCL/m08300_a0.cl @@ -0,0 +1,276 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier 
(CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global 
const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S 
(salt_bufs[salt_pos].salt_buf_pc[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_swap (&ctx1, w, pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08300_a1.cl b/OpenCL/m08300_a1.cl new file mode 100644 index 000000000..57904d8d5 --- /dev/null +++ b/OpenCL/m08300_a1.cl @@ -0,0 +1,248 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global 
const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = ((pws[gid].pw_len + combs_buf[il_pos].pw_len) & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_global_swap (&ctx1, pws[gid].i, pws[gid].pw_len); + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update 
(&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + 
/** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32 s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32 s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = ((pws[gid].pw_len + combs_buf[il_pos].pw_len) & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_global_swap (&ctx1, pws[gid].i, pws[gid].pw_len); + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_update (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update (&ctx1, s, salt_len); + + sha1_final (&ctx1); + + u32 digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_t ctx; + + sha1_init (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update (&ctx, s, salt_len); + + sha1_final (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32 r0 = 
digest[DGST_R0]; + const u32 r1 = digest[DGST_R1]; + const u32 r2 = digest[DGST_R2]; + const u32 r3 = digest[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m08300_a3.cl b/OpenCL/m08300_a3.cl new file mode 100644 index 000000000..0fc7148f0 --- /dev/null +++ b/OpenCL/m08300_a3.cl @@ -0,0 +1,286 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + 
w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32x s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32 (salt_bufs[salt_pos].salt_buf_pc[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_update_vector (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update_vector (&ctx1, s, salt_len); + + sha1_final_vector (&ctx1); + + u32x digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32x r0 = digest[DGST_R0]; + const u32x r1 = digest[DGST_R1]; + const u32x r2 = digest[DGST_R2]; + const u32x r3 = digest[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + 
+__kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_len = salt_bufs[salt_pos].salt_len; + + const u32 salt_lenv = ceil ((float) salt_len / 4); + + u32x s[64] = { 0 }; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 
salt_len_pc = salt_bufs[salt_pos].salt_len_pc; + + const u32 salt_len_pcv = ceil ((float) salt_len_pc / 4); + + u32x s_pc[64] = { 0 }; + + for (int idx = 0; idx < salt_len_pcv; idx++) + { + s_pc[idx] = swap32 (salt_bufs[salt_pos].salt_buf_pc[idx]); + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 salt_iter = salt_bufs[salt_pos].salt_iter; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = (pw_len & 0xff) << 24; + + ctx1.len = 1; + + sha1_update_vector (&ctx1, w, pw_len); + + sha1_update_vector (&ctx1, s_pc, salt_len_pc + 1); + + sha1_update_vector (&ctx1, s, salt_len); + + sha1_final_vector (&ctx1); + + u32x digest[5]; + + digest[0] = ctx1.h[0]; + digest[1] = ctx1.h[1]; + digest[2] = ctx1.h[2]; + digest[3] = ctx1.h[3]; + digest[4] = ctx1.h[4]; + + // iterations + + for (u32 i = 0; i < salt_iter; i++) + { + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + ctx.w0[0] = digest[0]; + ctx.w0[1] = digest[1]; + ctx.w0[2] = digest[2]; + ctx.w0[3] = digest[3]; + ctx.w1[0] = digest[4]; + + ctx.len = 20; + + sha1_update_vector (&ctx, s, salt_len); + + sha1_final_vector (&ctx); + + digest[0] = ctx.h[0]; + digest[1] = ctx.h[1]; + digest[2] = ctx.h[2]; + digest[3] = ctx.h[3]; + digest[4] = ctx.h[4]; + } + + const u32x r0 = digest[DGST_R0]; + const u32x r1 = digest[DGST_R1]; + const u32x r2 = digest[DGST_R2]; + const u32x r3 = digest[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 344d1a37dfe3ce4ed751b5f0d68fde4aa93221de Mon Sep 17 00:00:00 2001 From: philsmd Date: Thu, 3 Aug 2017 14:02:09 +0200 Subject: [PATCH 50/75] fixes 1306: every permanent chdir () needs to update the folder_config --- docs/changes.txt | 1 + src/restore.c | 63 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 12 deletions(-) 
diff --git a/docs/changes.txt b/docs/changes.txt index 6ca3df4a5..407876b44 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -20,6 +20,7 @@ ## - Fixed a parser error for mode -m 9820 = MS Office <= 2003 $3, SHA1 + RC4, collider #2 +- Fixed a problem with changed current working directory, for instance by using --restore together with --remove ## ## Improvements diff --git a/src/restore.c b/src/restore.c index 9d576e99e..82a2e14b2 100644 --- a/src/restore.c +++ b/src/restore.c @@ -9,6 +9,8 @@ #include "event.h" #include "user_options.h" #include "shared.h" +#include "pidfile.h" +#include "folder.h" #include "restore.h" #if defined (_WIN) @@ -45,7 +47,8 @@ static int init_restore (hashcat_ctx_t *hashcat_ctx) static int read_restore (hashcat_ctx_t *hashcat_ctx) { - restore_ctx_t *restore_ctx = hashcat_ctx->restore_ctx; + restore_ctx_t *restore_ctx = hashcat_ctx->restore_ctx; + folder_config_t *folder_config = hashcat_ctx->folder_config; if (restore_ctx->enabled == false) return 0; @@ -131,20 +134,56 @@ static int read_restore (hashcat_ctx_t *hashcat_ctx) return -1; } - event_log_warning (hashcat_ctx, "Changing current working directory to '%s'", rd->cwd); - event_log_warning (hashcat_ctx, NULL); - - if (chdir (rd->cwd)) + if (strncmp (rd->cwd, folder_config->cwd, sizeof (rd->cwd)) != 0) // check if we need to change the current working directory { - event_log_error (hashcat_ctx, "Directory '%s' needed to restore the session was not found.", rd->cwd); - - event_log_warning (hashcat_ctx, "Either create the directory, or update the directory within the .restore file."); - event_log_warning (hashcat_ctx, "Restore files can be analyzed and modified with analyze_hc_restore.pl:"); - event_log_warning (hashcat_ctx, " https://github.com/philsmd/analyze_hc_restore"); - event_log_warning (hashcat_ctx, "Directory must contain all files and folders from the original command line."); + event_log_warning (hashcat_ctx, "Changing current working directory to '%s'", rd->cwd); 
event_log_warning (hashcat_ctx, NULL); - return -1; + if (chdir (rd->cwd)) + { + event_log_error (hashcat_ctx, "Directory '%s' needed to restore the session was not found.", rd->cwd); + + event_log_warning (hashcat_ctx, "Either create the directory, or update the directory within the .restore file."); + event_log_warning (hashcat_ctx, "Restore files can be analyzed and modified with analyze_hc_restore.pl:"); + event_log_warning (hashcat_ctx, " https://github.com/philsmd/analyze_hc_restore"); + event_log_warning (hashcat_ctx, "Directory must contain all files and folders from the original command line."); + event_log_warning (hashcat_ctx, NULL); + + return -1; + } + + // if we are here, we also need to update the folder_config and .pid file: + + /** + * updated folders + */ + + const char *install_folder = NULL; + const char *shared_folder = NULL; + + #if defined (INSTALL_FOLDER) + install_folder = INSTALL_FOLDER; + #endif + + #if defined (SHARED_FOLDER) + shared_folder = SHARED_FOLDER; + #endif + + folder_config_destroy (hashcat_ctx); + + const int rc_folder_config_init = folder_config_init (hashcat_ctx, install_folder, shared_folder); + + if (rc_folder_config_init == -1) return -1; + + /** + * updated pidfile + */ + + pidfile_ctx_destroy (hashcat_ctx); + + const int rc_pidfile_init = pidfile_ctx_init (hashcat_ctx); + + if (rc_pidfile_init == -1) return -1; } return 0; From 1f42377931053418283c505c169e3b1b04f1c7e6 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 14:11:31 +0200 Subject: [PATCH 51/75] Simplify Lotus Notes/Domino 5 kernel --- .../{m08600_a0-optimized.cl => m08600_a0.cl} | 20 +- .../{m08600_a1-optimized.cl => m08600_a1.cl} | 20 +- .../{m08600_a3-optimized.cl => m08600_a3.cl} | 232 +----------------- src/interface.c | 2 +- 4 files changed, 7 insertions(+), 267 deletions(-) rename OpenCL/{m08600_a0-optimized.cl => m08600_a0.cl} (73%) rename OpenCL/{m08600_a1-optimized.cl => m08600_a1.cl} (76%) rename OpenCL/{m08600_a3-optimized.cl => 
m08600_a3.cl} (60%) diff --git a/OpenCL/m08600_a0-optimized.cl b/OpenCL/m08600_a0.cl similarity index 73% rename from OpenCL/m08600_a0-optimized.cl rename to OpenCL/m08600_a0.cl index d64bcc9ba..4e65a519b 100644 --- a/OpenCL/m08600_a0-optimized.cl +++ b/OpenCL/m08600_a0.cl @@ -229,7 +229,7 @@ void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __lo mdtransform_norecalc (state, checksum, s_lotus_magic_table); } -__kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global 
plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -308,15 +308,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global 
const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, 
__global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -406,11 +398,3 @@ __kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } } - -__kernel void m08600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, 
const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08600_a1-optimized.cl b/OpenCL/m08600_a1.cl similarity index 76% rename from OpenCL/m08600_a1-optimized.cl rename to OpenCL/m08600_a1.cl index 28aaa4c45..d169f79bf 100644 --- a/OpenCL/m08600_a1-optimized.cl +++ b/OpenCL/m08600_a1.cl @@ -227,7 +227,7 @@ void domino_big_md (const u32x saved_key[4], const u32 size, u32x state[4], __lo mdtransform_norecalc (state, checksum, s_lotus_magic_table); } -__kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -351,15 +351,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void 
*hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s04 (__global pw_t *pws, __global const 
kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, 
const u32 combs_mode, const u32 gid_max) { /** * base @@ -494,11 +486,3 @@ __kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } } - -__kernel void m08600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, 
__global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08600_a3-optimized.cl b/OpenCL/m08600_a3.cl similarity index 60% rename from OpenCL/m08600_a3-optimized.cl rename to OpenCL/m08600_a3.cl index 1d85b39d7..4c881033e 100644 --- a/OpenCL/m08600_a3-optimized.cl +++ b/OpenCL/m08600_a3.cl @@ -332,7 +332,7 @@ void m08600s (__local u32 *s_lotus_magic_table, u32 w[16], const u32 pw_len, __g } } -__kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global 
const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -389,121 +389,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global const kernel_rule_t *rule m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m08600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const 
u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = pws[gid].i[ 8]; - w[ 9] = pws[gid].i[ 9]; - w[10] = pws[gid].i[10]; - w[11] = pws[gid].i[11]; - w[12] = pws[gid].i[12]; - w[13] = pws[gid].i[13]; - w[14] = pws[gid].i[14]; - w[15] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, 
d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -559,117 +445,3 @@ __kernel void m08600_s04 (__global pw_t *pws, __global const kernel_rule_t *rule m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } - -__kernel void m08600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base 
- */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = 0; - w[ 9] = 0; - w[10] = 0; - w[11] = 0; - w[12] = 0; - w[13] = 0; - w[14] = 0; - w[15] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m08600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * sbox - */ - - __local u32 s_lotus_magic_table[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - s_lotus_magic_table[i] = lotus_magic_table[i]; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 w[16]; - - w[ 0] = pws[gid].i[ 0]; - w[ 1] = pws[gid].i[ 1]; - w[ 2] = pws[gid].i[ 2]; - w[ 3] = pws[gid].i[ 3]; - w[ 4] = pws[gid].i[ 4]; - w[ 5] = pws[gid].i[ 5]; - w[ 6] = pws[gid].i[ 6]; - w[ 7] = pws[gid].i[ 7]; - w[ 8] = pws[gid].i[ 8]; - w[ 9] = pws[gid].i[ 9]; - w[10] = pws[gid].i[10]; - w[11] = pws[gid].i[11]; - w[12] = pws[gid].i[12]; - w[13] = pws[gid].i[13]; - w[14] = pws[gid].i[14]; - w[15] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} diff --git a/src/interface.c b/src/interface.c index 423846406..fd282f864 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24748,7 +24748,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 8000: hashconfig->pw_max = 30; break; // 
http://infocenter.sybase.com/help/index.jsp?topic=/com.sybase.infocenter.dc31654.1570/html/sag1/CIHIBDBA.htm case 8200: hashconfig->pw_max = PW_MAX; break; case 8500: hashconfig->pw_max = 8; break; // Underlaying DES max - case 8600: hashconfig->pw_max = 16; break; // Lotus Notes/Domino 5 limits itself to 8 + case 8600: hashconfig->pw_max = 16; break; // Lotus Notes/Domino 5 limits itself to 16 case 8800: hashconfig->pw_max = PW_MAX; break; case 8900: hashconfig->pw_max = PW_MAX; break; case 9100: hashconfig->pw_max = 64; break; // Lotus Notes/Domino 8 limits itself to 64 From 14983a75427a5bfffd02dd42f9657fb14038cbdb Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 14:27:53 +0200 Subject: [PATCH 52/75] Simplify RACF kernel --- .../{m08500_a0-optimized.cl => m08500_a0.cl} | 22 +++---------------- .../{m08500_a1-optimized.cl => m08500_a1.cl} | 22 +++---------------- .../{m08500_a3-optimized.cl => m08500_a3.cl} | 22 +++---------------- 3 files changed, 9 insertions(+), 57 deletions(-) rename OpenCL/{m08500_a0-optimized.cl => m08500_a0.cl} (86%) rename OpenCL/{m08500_a1-optimized.cl => m08500_a1.cl} (87%) rename OpenCL/{m08500_a3-optimized.cl => m08500_a3.cl} (87%) diff --git a/OpenCL/m08500_a0-optimized.cl b/OpenCL/m08500_a0.cl similarity index 86% rename from OpenCL/m08500_a0-optimized.cl rename to OpenCL/m08500_a0.cl index 5626b51ce..e81915163 100644 --- a/OpenCL/m08500_a0-optimized.cl +++ b/OpenCL/m08500_a0.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -521,7 +521,7 @@ void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } -__kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -632,15 +632,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08500_m08 
(__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const 
u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -764,11 +756,3 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (iv[0], iv[1], z, z); } } - -__kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, 
__global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08500_a1-optimized.cl b/OpenCL/m08500_a1.cl similarity index 87% rename from OpenCL/m08500_a1-optimized.cl rename to OpenCL/m08500_a1.cl index 6b7dca17e..ade03fca5 100644 --- a/OpenCL/m08500_a1-optimized.cl +++ b/OpenCL/m08500_a1.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -519,7 +519,7 @@ void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) | BOX1 (((w1 >> 24) & 0xff), c_ascii_to_ebcdic_pc) << 24; } -__kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 
digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -673,15 +673,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m08500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 
*d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void 
*esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -846,11 +838,3 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule COMPARE_S_SIMD (iv[0], iv[1], z, z); } } - -__kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, 
__global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m08500_a3-optimized.cl b/OpenCL/m08500_a3.cl similarity index 87% rename from OpenCL/m08500_a3-optimized.cl rename to OpenCL/m08500_a3.cl index 8cff7bcc2..385ba2ee4 100644 --- a/OpenCL/m08500_a3-optimized.cl +++ b/OpenCL/m08500_a3.cl @@ -3,7 +3,7 @@ 
* License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -657,7 +657,7 @@ void m08500s (__local u32 (*s_SPtrans)[64], __local u32 (*s_skb)[64], u32 w[16], } } -__kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 
*d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -731,15 +731,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global const kernel_rule_t *rule m08500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m08500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 
gid_max) -{ -} - -__kernel void m08500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 
loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -812,11 +804,3 @@ __kernel void m08500_s04 (__global pw_t *pws, __global const kernel_rule_t *rule m08500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } - -__kernel void m08500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, 
__global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m08500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} From e0c86f40e53353642db6dc44508302fd63db8767 Mon Sep 17 00:00:00 2001 
From: jsteube Date: Thu, 3 Aug 2017 14:33:31 +0200 Subject: [PATCH 53/75] Fix maximum password length supported in Lotus Notes/Domino 6 --- src/interface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/interface.c b/src/interface.c index fd282f864..157c6593d 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24749,9 +24749,10 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 8200: hashconfig->pw_max = PW_MAX; break; case 8500: hashconfig->pw_max = 8; break; // Underlaying DES max case 8600: hashconfig->pw_max = 16; break; // Lotus Notes/Domino 5 limits itself to 16 + case 8700: hashconfig->pw_max = 64; break; // https://www.ibm.com/support/knowledgecenter/en/SSKTWP_8.5.3/com.ibm.notes85.client.doc/fram_limits_of_notes_r.html case 8800: hashconfig->pw_max = PW_MAX; break; case 8900: hashconfig->pw_max = PW_MAX; break; - case 9100: hashconfig->pw_max = 64; break; // Lotus Notes/Domino 8 limits itself to 64 + case 9100: hashconfig->pw_max = 64; break; // https://www.ibm.com/support/knowledgecenter/en/SSKTWP_8.5.3/com.ibm.notes85.client.doc/fram_limits_of_notes_r.html case 9200: hashconfig->pw_max = PW_MAX; break; case 9300: hashconfig->pw_max = PW_MAX; break; case 9400: hashconfig->pw_max = PW_MAX; break; From 37432b19bcb2d3c102d5659f098730710282312f Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 14:42:11 +0200 Subject: [PATCH 54/75] Mix in pure kernel functions to MS Office <= 2003 $0/$1, MD5 + RC4 --- OpenCL/m09700_a0-optimized.cl | 99 +---------------------------- OpenCL/m09700_a1-optimized.cl | 99 +---------------------------- OpenCL/m09700_a3-optimized.cl | 99 +---------------------------- OpenCL/m09720_a0-optimized.cl | 115 +++------------------------------- OpenCL/m09720_a1-optimized.cl | 115 +++------------------------------- OpenCL/m09720_a3-optimized.cl | 115 +++------------------------------- 6 files changed, 30 insertions(+), 612 deletions(-) diff --git a/OpenCL/m09700_a0-optimized.cl 
b/OpenCL/m09700_a0-optimized.cl index 6262cd701..49fc39362 100644 --- a/OpenCL/m09700_a0-optimized.cl +++ b/OpenCL/m09700_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -138,104 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - 
MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I 
, a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) { u32 digest_t0[2]; diff --git a/OpenCL/m09700_a1-optimized.cl b/OpenCL/m09700_a1-optimized.cl index 50dc116e9..5956d9d52 100644 --- a/OpenCL/m09700_a1-optimized.cl +++ b/OpenCL/m09700_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -136,104 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, 
d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, 
MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) { u32 digest_t0[2]; diff --git a/OpenCL/m09700_a3-optimized.cl b/OpenCL/m09700_a3-optimized.cl index 645036a74..306f42dfc 100644 --- a/OpenCL/m09700_a3-optimized.cl +++ b/OpenCL/m09700_a3-optimized.cl @@ -9,6 +9,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -133,104 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t 
= w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - 
MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m09700m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t 
*bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m09720_a0-optimized.cl b/OpenCL/m09720_a0-optimized.cl index 587bedb67..a8a0523d2 100644 --- a/OpenCL/m09720_a0-optimized.cl +++ b/OpenCL/m09720_a0-optimized.cl @@ -13,104 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" - -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, 
w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, 
MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} +#include "inc_hash_md5.cl" void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { @@ -220,7 +123,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[3] |= digest_t3[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -273,7 +176,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 62.. w3_t[3] |= digest_t2[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -326,7 +229,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 61.. 
w3_t[3] |= digest_t1[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -379,7 +282,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 60.. w3_t[3] = digest_t0[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -434,7 +337,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] |= digest_t3[0]; w3_t[3] = digest_t3[1]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = salt_buf_t0[0]; w0_t[1] = salt_buf_t0[1]; @@ -453,7 +356,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] = 21 * 16 * 8; w3_t[3] = 0; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ 
-529,7 +432,7 @@ __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; @@ -647,7 +550,7 @@ __kernel void m09720_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; diff --git a/OpenCL/m09720_a1-optimized.cl b/OpenCL/m09720_a1-optimized.cl index 97f26c391..1e28f738f 100644 --- a/OpenCL/m09720_a1-optimized.cl +++ b/OpenCL/m09720_a1-optimized.cl @@ -11,104 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, 
a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, 
MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} +#include "inc_hash_md5.cl" void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { @@ -218,7 +121,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[3] |= digest_t3[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -271,7 +174,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 62.. w3_t[3] |= digest_t2[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -324,7 +227,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 61.. w3_t[3] |= digest_t1[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -377,7 +280,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 60.. 
w3_t[3] = digest_t0[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -432,7 +335,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] |= digest_t3[0]; w3_t[3] = digest_t3[1]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = salt_buf_t0[0]; w0_t[1] = salt_buf_t0[1]; @@ -451,7 +354,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] = 21 * 16 * 8; w3_t[3] = 0; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -577,7 +480,7 @@ __kernel void m09720_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); 
digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; @@ -745,7 +648,7 @@ __kernel void m09720_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, digest_pre); + md5_transform_vector (w0, w1, w2, w3, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; diff --git a/OpenCL/m09720_a3-optimized.cl b/OpenCL/m09720_a3-optimized.cl index 870b324a1..237a3cb9d 100644 --- a/OpenCL/m09720_a3-optimized.cl +++ b/OpenCL/m09720_a3-optimized.cl @@ -11,104 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - 
MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} +#include "inc_hash_md5.cl" void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { @@ -218,7 +121,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[3] |= digest_t3[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -271,7 +174,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 62.. w3_t[3] |= digest_t2[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -324,7 +227,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 61.. w3_t[3] |= digest_t1[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -377,7 +280,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) // 60.. 
w3_t[3] = digest_t0[0]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = 0; w0_t[1] = 0; @@ -432,7 +335,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] |= digest_t3[0]; w3_t[3] = digest_t3[1]; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = salt_buf_t0[0]; w0_t[1] = salt_buf_t0[1]; @@ -451,7 +354,7 @@ void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) w3_t[2] = 21 * 16 * 8; w3_t[3] = 0; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); } void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) @@ -519,7 +422,7 @@ void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest_pre); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, 
digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; @@ -621,7 +524,7 @@ void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest_pre[2] = MD5M_C; digest_pre[3] = MD5M_D; - md5_transform (w0_t, w1_t, w2_t, w3_t, digest_pre); + md5_transform_vector (w0_t, w1_t, w2_t, w3_t, digest_pre); digest_pre[0] &= 0xffffffff; digest_pre[1] &= 0x000000ff; From a650b0864ef6130425794fd4b0f84b1bfd55e152 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 14:44:09 +0200 Subject: [PATCH 55/75] Mix in pure kernel functions to MS Office <= 2003 $0/$1, MD5 + RC4 --- OpenCL/m09710_a0-optimized.cl | 99 +---------------------------------- OpenCL/m09710_a1-optimized.cl | 99 +---------------------------------- OpenCL/m09710_a3-optimized.cl | 99 +---------------------------------- 3 files changed, 3 insertions(+), 294 deletions(-) diff --git a/OpenCL/m09710_a0-optimized.cl b/OpenCL/m09710_a0-optimized.cl index b0676599d..4b72fcf7d 100644 --- a/OpenCL/m09710_a0-optimized.cl +++ b/OpenCL/m09710_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -138,104 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, 
c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, 
MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m09710_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09710_a1-optimized.cl b/OpenCL/m09710_a1-optimized.cl index 9aa5bc540..6bbfb702f 100644 --- a/OpenCL/m09710_a1-optimized.cl +++ b/OpenCL/m09710_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -136,104 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP 
(MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, 
a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m09710_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const 
u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09710_a3-optimized.cl b/OpenCL/m09710_a3-optimized.cl index 3333fadb9..5b50fefa7 100644 --- a/OpenCL/m09710_a3-optimized.cl +++ b/OpenCL/m09710_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" typedef struct { @@ -136,104 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - 
MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I 
, d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m09710m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice01_t *oldoffice01_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** From 3a042972a4d145464e73a0eac4e1d0441175ebb3 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 14:52:09 +0200 Subject: [PATCH 56/75] Mix in pure kernel functions to MS Office <= 2003 $3/$4, SHA1 + RC4 --- 
OpenCL/m09800_a0-optimized.cl | 129 +------------------------------- OpenCL/m09800_a1-optimized.cl | 129 +------------------------------- OpenCL/m09800_a3-optimized.cl | 129 +------------------------------- OpenCL/m09810_a0-optimized.cl | 129 +------------------------------- OpenCL/m09810_a1-optimized.cl | 129 +------------------------------- OpenCL/m09810_a3-optimized.cl | 129 +------------------------------- OpenCL/m09820_a0-optimized.cl | 137 ++-------------------------------- OpenCL/m09820_a1-optimized.cl | 137 ++-------------------------------- OpenCL/m09820_a3-optimized.cl | 137 ++-------------------------------- 9 files changed, 21 insertions(+), 1164 deletions(-) diff --git a/OpenCL/m09800_a0-optimized.cl b/OpenCL/m09800_a0-optimized.cl index f8a70ca7a..2c26ce3be 100644 --- a/OpenCL/m09800_a0-optimized.cl +++ b/OpenCL/m09800_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -138,134 +139,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - 
SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP 
(SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); 
SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); 
SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09800_a1-optimized.cl b/OpenCL/m09800_a1-optimized.cl index 0d0a44811..5064b8c61 100644 --- a/OpenCL/m09800_a1-optimized.cl +++ b/OpenCL/m09800_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -136,134 
+137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = 
rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 
((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = 
rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, 
__global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09800_a3-optimized.cl b/OpenCL/m09800_a3-optimized.cl index 6392d5b6b..ffa1df0ec 100644 --- a/OpenCL/m09800_a3-optimized.cl +++ b/OpenCL/m09800_a3-optimized.cl @@ -9,6 +9,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -133,134 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, 
w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = 
rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t 
= rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = 
rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - void m09800m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m09810_a0-optimized.cl b/OpenCL/m09810_a0-optimized.cl index 92f7c8a7a..8ee19da53 100644 --- a/OpenCL/m09810_a0-optimized.cl +++ b/OpenCL/m09810_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -138,134 +139,6 @@ u8 rc4_next_16 
(__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ 
w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ 
wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ 
w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09810_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09810_a1-optimized.cl b/OpenCL/m09810_a1-optimized.cl index 7eb8619be..7d08a70d5 100644 --- a/OpenCL/m09810_a1-optimized.cl +++ b/OpenCL/m09810_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -136,134 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP 
(SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ 
w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t 
^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ 
wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m09810_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m09810_a3-optimized.cl b/OpenCL/m09810_a3-optimized.cl index b2db6890c..fc8733b50 100644 --- a/OpenCL/m09810_a3-optimized.cl +++ b/OpenCL/m09810_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_sha1.cl" typedef struct { @@ -136,134 +137,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 
in[4], u32 out[4 return j; } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, 
C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, 
wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, 
E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - void m09810m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m09820_a0-optimized.cl b/OpenCL/m09820_a0-optimized.cl index 367d2a32a..f895d7939 100644 --- a/OpenCL/m09820_a0-optimized.cl +++ b/OpenCL/m09820_a0-optimized.cl @@ -13,134 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP 
(SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, 
C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP 
(SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP 
(SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -232,7 +105,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -257,7 +130,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, 
digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; @@ -378,7 +251,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -403,7 +276,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; diff --git a/OpenCL/m09820_a1-optimized.cl b/OpenCL/m09820_a1-optimized.cl index 982cae319..113af54b0 100644 --- a/OpenCL/m09820_a1-optimized.cl +++ b/OpenCL/m09820_a1-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP 
(SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ 
wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 
((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t 
^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { @@ -280,7 +153,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -305,7 +178,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + 
sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; @@ -476,7 +349,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -501,7 +374,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global const kernel_rule_t *rule digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_vector (w0, w1, w2, w3, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; diff --git a/OpenCL/m09820_a3-optimized.cl b/OpenCL/m09820_a3-optimized.cl index 34472dc8b..a63d26aa5 100644 --- a/OpenCL/m09820_a3-optimized.cl +++ b/OpenCL/m09820_a3-optimized.cl @@ -11,134 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" - -void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) -{ - u32x A = digest[0]; - u32x B = digest[1]; - u32x C = digest[2]; - u32x D = digest[3]; - u32x E = digest[4]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, 
B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, 
w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, 
A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, 
wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} +#include "inc_hash_sha1.cl" void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global oldoffice34_t *oldoffice34_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -206,7 +79,7 @@ void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = digest[0]; w0_t[1] = digest[1]; @@ -231,7 +104,7 @@ void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = 
SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; @@ -320,7 +193,7 @@ void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = digest[0]; w0_t[1] = digest[1]; @@ -345,7 +218,7 @@ void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[3] = SHA1M_D; digest[4] = SHA1M_E; - sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); + sha1_transform_vector (w0_t, w1_t, w2_t, w3_t, digest); digest[0] = swap32 (digest[0]); digest[1] = swap32 (digest[1]) & 0xff; From a9375b9817c39d2972a74807c9624736f891ecae Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 15:08:08 +0200 Subject: [PATCH 57/75] Fix maximum password length supported in MS Office <= 2003 --- src/interface.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/interface.c b/src/interface.c index 157c6593d..fd22a3c8d 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24704,9 +24704,9 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) } // pw_max : all modes listed in the following switch cases are - // the maximum possible password length by the related system - // plus the opencl kernels support to crack them without -L set by the user - // however, some modes have a self-set and some have + // the maximum possible password length of the related system + // plus the opencl kernels which eventually allows cracking of passwords of up length PW_MAX for free (no speed drop). 
+ // some modes have a self-set and some have // underlaying algorithms specific hard maximum password length // these limits override all previous restrictions, always @@ -24716,7 +24716,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 1500: hashconfig->pw_max = 8; break; // Underlaying DES max case 2100: hashconfig->pw_max = PW_MAX; break; case 2400: hashconfig->pw_max = 16; break; // Cisco-PIX MD5 sets w[4] = 0x80 - case 2410: hashconfig->pw_max = 12; break; // Cisco-ASA MD5 sets w[4] = 0x80 and has a 4 byte fixed salt + case 2410: hashconfig->pw_max = 12; break; // Cisco-ASA MD5 sets w[4] = 0x80 plus has a 4 byte fixed salt case 2500: hashconfig->pw_max = 63; break; // WPA/WPA2 limits itself to 63 by RFC case 2501: hashconfig->pw_max = 64; break; // WPA/WPA2 PMK fixed length case 3000: hashconfig->pw_max = 7; break; // LM max @@ -24758,8 +24758,12 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 9400: hashconfig->pw_max = PW_MAX; break; case 9500: hashconfig->pw_max = PW_MAX; break; case 9600: hashconfig->pw_max = PW_MAX; break; + case 9700: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx case 9710: hashconfig->pw_max = 5; break; // Underlaying RC4-40 max + case 9720: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx + case 9800: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx case 9810: hashconfig->pw_max = 5; break; // Underlaying RC4-40 max + case 9820: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx case 10000: hashconfig->pw_max = PW_MAX; break; case 10300: hashconfig->pw_max = 40; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 10400: hashconfig->pw_max = 32; break; // https://www.pdflib.com/knowledge-base/pdf-password-security/encryption/ From 177800d1d0ba11900f6139715b0afbedb5801b53 
Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 15:21:39 +0200 Subject: [PATCH 58/75] Add pure kernels for RAdmin2 --- OpenCL/m09900_a0.cl | 134 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m09900_a1.cl | 110 +++++++++++++++++++++++++++++++++ OpenCL/m09900_a3.cl | 144 ++++++++++++++++++++++++++++++++++++++++++++ src/interface.c | 1 + 4 files changed, 389 insertions(+) create mode 100644 OpenCL/m09900_a0.cl create mode 100644 OpenCL/m09900_a1.cl create mode 100644 OpenCL/m09900_a3.cl diff --git a/OpenCL/m09900_a0.cl b/OpenCL/m09900_a0.cl new file mode 100644 index 000000000..ff06f126e --- /dev/null +++ b/OpenCL/m09900_a0.cl @@ -0,0 +1,134 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const 
u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, 100); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= 
gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx; + + md5_init (&ctx); + + md5_update (&ctx, w, 100); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m09900_a1.cl b/OpenCL/m09900_a1.cl new file mode 100644 index 000000000..6ea1e9497 --- /dev/null +++ b/OpenCL/m09900_a1.cl @@ -0,0 +1,110 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, 
__global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, 100 - pws[gid].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 
loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, 100 - pws[gid].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m09900_a3.cl b/OpenCL/m09900_a3.cl new file mode 100644 index 000000000..43c464e78 --- /dev/null +++ b/OpenCL/m09900_a3.cl @@ -0,0 +1,144 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t 
*plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, 100); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t 
*digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + md5_update_vector (&ctx, w, 100); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/src/interface.c b/src/interface.c index fd22a3c8d..9dbb795b2 100644 --- a/src/interface.c +++ b/src/interface.c @@ -24764,6 +24764,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) case 9800: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx case 9810: hashconfig->pw_max 
= 5; break; // Underlaying RC4-40 max case 9820: hashconfig->pw_max = 15; break; // https://msdn.microsoft.com/en-us/library/dd772916(v=office.12).aspx + case 9900: hashconfig->pw_max = 100; break; // RAdmin2 sets w[25] = 0x80 case 10000: hashconfig->pw_max = PW_MAX; break; case 10300: hashconfig->pw_max = 40; break; // https://www.daniel-berlin.de/security/sap-sec/password-hash-algorithms/ case 10400: hashconfig->pw_max = 32; break; // https://www.pdflib.com/knowledge-base/pdf-password-security/encryption/ From bc9f721dcdf95ddf16f21e364d951031a2b8e4f0 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 3 Aug 2017 20:30:46 +0200 Subject: [PATCH 59/75] Mix in pure kernel functions to PDF 1.1 - 1.3 (Acrobat 2 - 4) --- OpenCL/m10400_a0-optimized.cl | 99 +-------------------------- OpenCL/m10400_a1-optimized.cl | 99 +-------------------------- OpenCL/m10400_a3-optimized.cl | 99 +-------------------------- OpenCL/m10410_a0-optimized.cl | 1 + OpenCL/m10410_a1-optimized.cl | 1 + OpenCL/m10410_a3-optimized.cl | 1 + OpenCL/m10420_a0-optimized.cl | 101 +-------------------------- OpenCL/m10420_a1-optimized.cl | 101 +-------------------------- OpenCL/m10420_a3-optimized.cl | 125 ++++------------------------------ 9 files changed, 24 insertions(+), 603 deletions(-) diff --git a/OpenCL/m10400_a0-optimized.cl b/OpenCL/m10400_a0-optimized.cl index 9c1fae1f3..f93a6ad62 100644 --- a/OpenCL/m10400_a0-optimized.cl +++ b/OpenCL/m10400_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { @@ -135,104 +136,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 ou return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = 
w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - 
MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m10400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global 
void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10400_a1-optimized.cl b/OpenCL/m10400_a1-optimized.cl index cbcdf37d5..3f3a1f43b 100644 --- a/OpenCL/m10400_a1-optimized.cl +++ b/OpenCL/m10400_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { @@ -133,104 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 ou return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, 
MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m10400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global 
const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10400_a3-optimized.cl b/OpenCL/m10400_a3-optimized.cl index c1205fe5f..e4b04ee67 100644 --- a/OpenCL/m10400_a3-optimized.cl +++ b/OpenCL/m10400_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { @@ -133,104 +134,6 @@ u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 *in, u32 ou return j; } -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, 
w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, 
MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m10400m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 
bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** diff --git a/OpenCL/m10410_a0-optimized.cl b/OpenCL/m10410_a0-optimized.cl index 064a5d42d..c96e3b6d2 100644 --- a/OpenCL/m10410_a0-optimized.cl +++ b/OpenCL/m10410_a0-optimized.cl @@ -14,6 +14,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { diff --git a/OpenCL/m10410_a1-optimized.cl b/OpenCL/m10410_a1-optimized.cl index ac85ca10b..ea23844cc 100644 --- a/OpenCL/m10410_a1-optimized.cl +++ b/OpenCL/m10410_a1-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { diff --git a/OpenCL/m10410_a3-optimized.cl b/OpenCL/m10410_a3-optimized.cl index db10bc257..46ad89173 100644 --- a/OpenCL/m10410_a3-optimized.cl +++ b/OpenCL/m10410_a3-optimized.cl @@ -12,6 +12,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32 padding[8] = { diff --git a/OpenCL/m10420_a0-optimized.cl b/OpenCL/m10420_a0-optimized.cl index 9eb143638..7295e2741 100644 --- a/OpenCL/m10420_a0-optimized.cl +++ b/OpenCL/m10420_a0-optimized.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -13,6 +13,7 @@ #include "inc_rp.h" #include "inc_rp.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32a padding[8] = { @@ -26,104 +27,6 @@ __constant u32a padding[8] = 0x7a695364 }; -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t 
= w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP 
(MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m10420_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, 
__global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10420_a1-optimized.cl b/OpenCL/m10420_a1-optimized.cl index bf016c57c..1e64ca279 100644 --- a/OpenCL/m10420_a1-optimized.cl +++ b/OpenCL/m10420_a1-optimized.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -11,6 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32a padding[8] = { @@ -24,104 +25,6 @@ __constant u32a padding[8] = 0x7a695364 }; -void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) -{ - u32x a = digest[0]; - u32x b = digest[1]; - u32x c = digest[2]; - u32x d = digest[3]; - - u32x w0_t = w0[0]; - u32x w1_t = w0[1]; - u32x w2_t = w0[2]; - u32x w3_t = w0[3]; - u32x w4_t = w1[0]; - u32x w5_t = w1[1]; - u32x w6_t = w1[2]; - u32x w7_t = w1[3]; - u32x w8_t = w2[0]; - u32x w9_t = w2[1]; - u32x wa_t = w2[2]; - u32x wb_t = w2[3]; - u32x wc_t = w3[0]; - u32x wd_t = w3[1]; - u32x we_t = w3[2]; - u32x wf_t = w3[3]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, 
MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - 
MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - __kernel void m10420_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const 
u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m10420_a3-optimized.cl b/OpenCL/m10420_a3-optimized.cl index 0c1b76e5b..9df3068d8 100644 --- a/OpenCL/m10420_a3-optimized.cl +++ b/OpenCL/m10420_a3-optimized.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -#define NEW_SIMD_CODE +//#define NEW_SIMD_CODE #include "inc_vendor.cl" #include "inc_hash_constants.h" @@ -11,6 +11,7 @@ #include "inc_types.cl" #include "inc_common.cl" #include "inc_simd.cl" +#include "inc_hash_md5.cl" __constant u32a padding[8] = { @@ -24,104 +25,6 @@ __constant u32a padding[8] = 0x7a695364 }; -void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, 
MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, 
MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const 
digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global pdf_t *pdf_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** @@ -245,7 +148,7 @@ void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[2] = MD5M_C; digest[3] = MD5M_D; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = P; w0_t[1] = id_buf[0]; @@ -264,12 +167,12 @@ void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl w3_t[2] = 84 * 8; w3_t[3] = 0; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; - u32x c = 0; - u32x d = 0; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; + u32 c = 0; + u32 d = 0; COMPARE_M_SIMD (a, b, c, d); } @@ -410,7 +313,7 @@ void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl digest[2] = MD5M_C; digest[3] = MD5M_D; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = P; w0_t[1] = id_buf[0]; @@ -429,12 +332,12 @@ void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl w3_t[2] = 84 * 8; w3_t[3] = 0; - md5_transform_S (w0_t, w1_t, w2_t, w3_t, digest); + md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32x a = digest[0]; - u32x b = digest[1] & 0xff; - u32x c = 0; - u32x d = 0; + u32 a = digest[0]; + u32 b = digest[1] & 0xff; + u32 c = 0; + u32 d = 0; COMPARE_S_SIMD (a, b, c, d); } From c9cae1f6634d813ff1abced6cd027f90e65c617e Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 4 Aug 2017 13:03:54 +0200 Subject: [PATCH 
60/75] Add pure kernels for PrestaShop --- OpenCL/m11000_a0.cl | 142 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m11000_a1.cl | 114 ++++++++++++++++++++++++++++++++ OpenCL/m11000_a3.cl | 156 ++++++++++++++++++++++++++++++++++++++++++++ tools/test.sh | 2 +- 4 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 OpenCL/m11000_a0.cl create mode 100644 OpenCL/m11000_a1.cl create mode 100644 OpenCL/m11000_a3.cl diff --git a/OpenCL/m11000_a0.cl b/OpenCL/m11000_a0.cl new file mode 100644 index 000000000..6449f5692 --- /dev/null +++ b/OpenCL/m11000_a0.cl @@ -0,0 +1,142 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * 
modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const 
u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx = ctx0; + + md5_update (&ctx, w, pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11000_a1.cl b/OpenCL/m11000_a1.cl new file mode 100644 index 000000000..291d1c783 --- /dev/null +++ b/OpenCL/m11000_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, 
__global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 
*d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx = ctx0; + + md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11000_a3.cl b/OpenCL/m11000_a3.cl new file mode 100644 index 000000000..5c4788266 --- /dev/null +++ b/OpenCL/m11000_a3.cl @@ -0,0 +1,156 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void 
*tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11000_sxx (__global pw_t *pws, __global 
const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + 
md5_ctx_vector_t ctx; + + md5_init_vector_from_scalar (&ctx, &ctx0); + + md5_update_vector (&ctx, w, pw_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/tools/test.sh b/tools/test.sh index c753e30bc..090c09eca 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -24,7 +24,7 @@ NEVER_CRACK="11600 14900" SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12001 12100 12200 12300 12400 12500 12700 12800 12900 13000 13200 13400 13600 14600 14700 14800 15100 15200 15300 15600 15700 15800" -OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0 -d 1" +OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0 " OUTD="test_$(date +%s)" From c8a645dc1698ab568d3381bc679414f054376fb4 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 4 Aug 2017 14:12:37 +0200 Subject: [PATCH 61/75] Fix invalid default salt length for mode 11000 in benchmark --- src/interface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/interface.c b/src/interface.c index 9dbb795b2..6d195b58d 100644 --- a/src/interface.c +++ b/src/interface.c @@ -25111,6 +25111,8 @@ void hashconfig_benchmark_defaults (hashcat_ctx_t *hashcat_ctx, salt_t *salt, vo break; case 10300: salt->salt_len = 12; break; + case 11000: salt->salt_len = 56; + break; case 11500: salt->salt_len = 4; break; case 11600: salt->salt_len = 4; From a53d9e09dea343c4c06665d68db8429786a72df7 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 4 Aug 2017 14:12:58 +0200 Subject: [PATCH 62/75] Fix some issue with offset_minus_4 --- OpenCL/inc_common.cl | 
34 ++++++----------- OpenCL/inc_rp.cl | 6 +-- OpenCL/m00500-optimized.cl | 60 +++++++++++------------------- OpenCL/m01600-optimized.cl | 60 +++++++++++------------------- OpenCL/m05800-optimized.cl | 16 +++----- OpenCL/m05800.cl | 16 +++----- OpenCL/m06300-optimized.cl | 60 +++++++++++------------------- OpenCL/m07400-optimized.cl | 76 ++++++++++++++++---------------------- 8 files changed, 119 insertions(+), 209 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index f50e073f9..4673a5bc4 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -2939,11 +2939,11 @@ void append_0x80_1x16 (u32x w[16], const u32 offset) void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -3469,8 +3469,6 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -3802,7 +3800,7 @@ void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x { const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; switch (offset / 4) { @@ -6184,11 +6182,11 @@ void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { 
case 0: @@ -7226,8 +7224,6 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -15044,11 +15040,11 @@ void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -15574,8 +15570,6 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -15907,7 +15901,7 @@ void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 { const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; switch (offset / 4) { @@ -18289,11 +18283,11 @@ void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -19331,8 +19325,6 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -25596,11 +25588,11 
@@ void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u3 void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -32294,8 +32286,6 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index c50ec4a67..170ec5385 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -756,11 +756,11 @@ void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0 void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC u32 s0 = 0; u32 s1 = 0; u32 s2 = 0; @@ -895,8 +895,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; u32 s0 = 0; diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index 81cd4f8bf..cb89b0b7b 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -114,7 +114,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 
block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -122,19 +122,18 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -142,13 +141,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp3 = tmp4; tmp4 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -156,10 +151,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -226,7 +220,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -234,19 +228,18 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - 
offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -254,13 +247,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp3 = tmp4; tmp4 = 0x80; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -268,10 +257,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0x80, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -338,44 +326,38 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; 
tmp2 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); tmp1 = __byte_perm (append[0], append[1], selector); tmp2 = __byte_perm (append[1], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index fd5c4f293..d624b4678 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -113,7 +113,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -121,19 +121,18 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -141,13 +140,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp3 = tmp4; tmp4 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -155,10 +150,9 @@ void memcat16 (u32 block0[4], 
u32 block1[4], u32 block2[4], u32 block3[4], const tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -225,7 +219,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -233,19 +227,18 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -253,13 +246,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp3 = tmp4; tmp4 = 0x80; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -267,10 +256,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0x80, selector); - #endif - const u32 div = block_len / 4; + const u32 div = 
offset / 4; switch (div) { @@ -337,44 +325,38 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); tmp1 = __byte_perm (append[0], append[1], selector); tmp2 = __byte_perm (append[1], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index b84015eef..774b97327 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -2111,10 +2111,11 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 u32 tmp4; u32 tmp5; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (offset & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); @@ -2122,9 +2123,7 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 
= amd_bytealign (append[4], append[3], offset_minus_4); tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); - const u32 mod = offset & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -2133,13 +2132,9 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = tmp5; tmp5 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (offset & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -2148,7 +2143,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], append[4], selector); tmp5 = __byte_perm (append[4], 0, selector); - #endif const u32 div = offset / 4; diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index 3e9a9c72d..ec03f6218 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -2112,10 +2112,11 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 u32 tmp4; u32 tmp5; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (offset & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); @@ -2123,9 +2124,7 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); - const u32 mod = offset & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -2134,13 +2133,9 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = tmp5; tmp5 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (offset & 3); - const int selector = (0x76543210 >> 
(offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -2149,7 +2144,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], append[4], selector); tmp5 = __byte_perm (append[4], 0, selector); - #endif const u32 div = offset / 4; diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index 0645a12fd..6b3980270 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -110,7 +110,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -118,19 +118,18 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -138,13 +137,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp3 = tmp4; tmp4 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -152,10 +147,9 @@ void memcat16 (u32 block0[4], u32 block1[4], 
u32 block2[4], u32 block3[4], const tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -222,7 +216,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -230,19 +224,18 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -250,13 +243,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp3 = tmp4; tmp4 = 0x80; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -264,10 +253,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0x80, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; 
switch (div) { @@ -334,44 +322,38 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); tmp1 = __byte_perm (append[0], append[1], selector); tmp2 = __byte_perm (append[1], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index e3a9fbda5..3483c57ec 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -190,27 +190,26 @@ void bswap8 (u32 block[16]) block[ 7] = swap32 (block[ 7]); } -u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat16 (u32 block[16], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined 
IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -221,8 +220,6 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -232,7 +229,7 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 tmp4 = __byte_perm (append[3], 0, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -322,32 +319,31 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } -u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len, u32 digest[8]) +u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 append_len, u32 digest[8]) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = 
tmp1; tmp1 = tmp2; @@ -358,8 +354,6 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -371,7 +365,7 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 u32 carry[4] = { 0, 0, 0, 0 }; - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -471,7 +465,7 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; if (new_len >= 64) { @@ -490,27 +484,26 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 return new_len; } -u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -521,8 +514,6 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, 
append[0], selector); @@ -532,7 +523,7 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 tmp4 = __byte_perm (append[3], 0, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -632,30 +623,29 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 break; } - return block_len + append_len; + return offset + append_len; } -u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -666,8 +656,6 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -677,7 +665,7 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const tmp4 = __byte_perm (append[3], 0x80, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -777,7 +765,7 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const break; } - return block_len + append_len; 
+ return offset + append_len; } __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) From b9a1e84093fa3c4895f8ae721045b8f072b2e758 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 5 Aug 2017 13:23:51 +0200 Subject: [PATCH 63/75] Fix test script for updated max password length of -m 9700 and -m 9800 --- tools/test.pl | 28 ++++++++++++++++++++++++++-- tools/test.sh | 2 +- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/tools/test.pl b/tools/test.pl index 5d65ee760..14d44fe26 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -3545,7 +3545,7 @@ sub passthrough $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, $salt_len)); } - elsif ($mode == 9400 || $mode == 9500 || $mode == 9600 || $mode == 9700 || $mode == 9800) + elsif ($mode == 9400 || $mode == 9500 || $mode == 9600) { next if length ($word_buf) > 19; @@ -3553,6 +3553,14 @@ sub passthrough $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, $salt_len)); } + elsif ($mode == 9700 || $mode 
== 9800) + { + next if length ($word_buf) > 15; + + my $salt_len = 32; + + $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, $salt_len)); + } elsif ($mode == 10100) { $tmp_hash = gen_hash ($mode, $word_buf, substr ($salt_buf, 0, 32)); @@ -4326,7 +4334,7 @@ sub single } } } - elsif ($mode == 9400 || $mode == 9500 || $mode == 9600 || $mode == 9700 || $mode == 9800) + elsif ($mode == 9400 || $mode == 9500 || $mode == 9600) { my $salt_len = 32; @@ -4342,6 +4350,22 @@ sub single } } } + elsif ($mode == 9700 || $mode == 9800) + { + my $salt_len = 32; + + for (my $i = 1; $i < 16; $i++) + { + if ($len != 0) + { + rnd ($mode, $len, $salt_len); + } + else + { + rnd ($mode, $i, $salt_len); + } + } + } elsif ($mode == 10100) { for (my $i = 1; $i < 32; $i++) diff --git a/tools/test.sh b/tools/test.sh index 090c09eca..51ee40313 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -24,7 +24,7 @@ NEVER_CRACK="11600 14900" SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12001 12100 12200 12300 12400 12500 12700 12800 12900 13000 13200 13400 13600 14600 14700 14800 15100 15200 15300 15600 15700 15800" -OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0 " +OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0 -O" OUTD="test_$(date +%s)" From 6bafc385dc2845094b11d167dbc3ea67706a7cea Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 5 Aug 2017 13:39:30 +0200 Subject: [PATCH 64/75] Use pure kernels in test.sh by default --- tools/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/test.sh b/tools/test.sh index 51ee40313..df97d144b 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -24,7 +24,7 @@ NEVER_CRACK="11600 14900" 
SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12001 12100 12200 12300 12400 12500 12700 12800 12900 13000 13200 13400 13600 14600 14700 14800 15100 15200 15300 15600 15700 15800" -OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0 -O" +OPTS="--quiet --force --potfile-disable --runtime 400 --gpu-temp-disable --weak-hash-threshold=0" OUTD="test_$(date +%s)" From c5c12f89c1189a5d6761f0cca0151fd362e43737 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 5 Aug 2017 19:46:56 +0200 Subject: [PATCH 65/75] Rewrite code around amd_bytealign to be of type BE to save a branch afterwards --- OpenCL/inc_common.cl | 29794 ++++++++++++++++++++++++-------- OpenCL/inc_rp.cl | 214 +- OpenCL/m00500-optimized.cl | 214 +- OpenCL/m01600-optimized.cl | 214 +- OpenCL/m02810_a3.cl | 48 +- OpenCL/m05800-optimized.cl | 180 +- OpenCL/m05800.cl | 179 +- OpenCL/m06300-optimized.cl | 214 +- OpenCL/m07400-optimized.cl | 720 +- OpenCL/m11400_a0-optimized.cl | 261 +- OpenCL/m11400_a1-optimized.cl | 261 +- OpenCL/m11400_a3-optimized.cl | 261 +- OpenCL/m13800_a0-optimized.cl | 40 +- OpenCL/m13800_a1-optimized.cl | 40 +- OpenCL/m13800_a3-optimized.cl | 40 +- 15 files changed, 23824 insertions(+), 8856 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 4673a5bc4..3e3fe573c 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -2944,244 +2944,156 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 
(w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + switch (offset / 4) { case 0: - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + 
w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - 
w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 2: - w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = 
amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign 
(w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], 
w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -3189,32 +3101,18 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], 
offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -3223,30 +3121,17 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 8: - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 
0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -3256,28 +3141,16 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -3288,26 +3161,15 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = 
amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -3319,24 +3181,14 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -3349,22 +3201,13 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -3378,20 +3221,12 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + 
w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -3406,18 +3241,11 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -3433,16 +3261,10 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -3459,13 +3281,25 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); #endif #ifdef IS_NV @@ -3802,6 +3636,519 @@ void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] 
= swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + switch (offset / 4) + { + case 0: + c0[0] = amd_bytealign (w3[3], 0, offset); + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = amd_bytealign (w3[3], 0, offset); + c0[0] = amd_bytealign (w3[2], w3[3], offset); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = amd_bytealign (w3[3], 0, offset); + c0[1] = amd_bytealign (w3[2], w3[3], 
offset); + c0[0] = amd_bytealign (w3[1], w3[2], offset); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = amd_bytealign (w3[3], 0, offset); + c0[2] = amd_bytealign (w3[2], w3[3], offset); + c0[1] = amd_bytealign (w3[1], w3[2], offset); + c0[0] = amd_bytealign (w3[0], w3[1], offset); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = amd_bytealign (w3[3], 0, offset); + c0[3] = amd_bytealign (w3[2], w3[3], offset); + c0[2] = amd_bytealign (w3[1], w3[2], offset); + c0[1] = amd_bytealign (w3[0], w3[1], offset); + c0[0] = amd_bytealign (w2[3], w3[0], offset); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = 
amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = amd_bytealign (w3[3], 0, offset); + c1[0] = amd_bytealign (w3[2], w3[3], offset); + c0[3] = amd_bytealign (w3[1], w3[2], offset); + c0[2] = amd_bytealign (w3[0], w3[1], offset); + c0[1] = amd_bytealign (w2[3], w3[0], offset); + c0[0] = amd_bytealign (w2[2], w2[3], offset); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = amd_bytealign (w3[3], 0, offset); + c1[1] = amd_bytealign (w3[2], w3[3], offset); + c1[0] = amd_bytealign (w3[1], w3[2], offset); + c0[3] = amd_bytealign (w3[0], w3[1], offset); + c0[2] = amd_bytealign (w2[3], w3[0], offset); + c0[1] = amd_bytealign (w2[2], w2[3], offset); + c0[0] = amd_bytealign (w2[1], w2[2], offset); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], 
w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = amd_bytealign (w3[3], 0, offset); + c1[2] = amd_bytealign (w3[2], w3[3], offset); + c1[1] = amd_bytealign (w3[1], w3[2], offset); + c1[0] = amd_bytealign (w3[0], w3[1], offset); + c0[3] = amd_bytealign (w2[3], w3[0], offset); + c0[2] = amd_bytealign (w2[2], w2[3], offset); + c0[1] = amd_bytealign (w2[1], w2[2], offset); + c0[0] = amd_bytealign (w2[0], w2[1], offset); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = amd_bytealign (w3[3], 0, offset); + c1[3] = amd_bytealign (w3[2], w3[3], offset); + c1[2] = amd_bytealign (w3[1], w3[2], offset); + c1[1] = amd_bytealign (w3[0], w3[1], offset); + c1[0] = amd_bytealign (w2[3], w3[0], offset); + c0[3] = amd_bytealign (w2[2], w2[3], offset); + c0[2] = amd_bytealign (w2[1], w2[2], offset); + c0[1] = amd_bytealign (w2[0], w2[1], offset); + c0[0] = amd_bytealign (w1[3], w2[0], offset); + w3[3] = amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + 
w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = amd_bytealign (w3[3], 0, offset); + c2[0] = amd_bytealign (w3[2], w3[3], offset); + c1[3] = amd_bytealign (w3[1], w3[2], offset); + c1[2] = amd_bytealign (w3[0], w3[1], offset); + c1[1] = amd_bytealign (w2[3], w3[0], offset); + c1[0] = amd_bytealign (w2[2], w2[3], offset); + c0[3] = amd_bytealign (w2[1], w2[2], offset); + c0[2] = amd_bytealign (w2[0], w2[1], offset); + c0[1] = amd_bytealign (w1[3], w2[0], offset); + c0[0] = amd_bytealign (w1[2], w1[3], offset); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = amd_bytealign (w3[3], 0, offset); + c2[1] = amd_bytealign (w3[2], w3[3], offset); + c2[0] = amd_bytealign (w3[1], w3[2], offset); + c1[3] = amd_bytealign (w3[0], w3[1], offset); + c1[2] = amd_bytealign (w2[3], w3[0], offset); + c1[1] = amd_bytealign (w2[2], w2[3], offset); + c1[0] = amd_bytealign (w2[1], w2[2], offset); + c0[3] = amd_bytealign (w2[0], w2[1], offset); + c0[2] = amd_bytealign (w1[3], w2[0], offset); + c0[1] = amd_bytealign (w1[2], w1[3], offset); + c0[0] = amd_bytealign (w1[1], w1[2], offset); + w3[3] = amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 
0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = amd_bytealign (w3[3], 0, offset); + c2[2] = amd_bytealign (w3[2], w3[3], offset); + c2[1] = amd_bytealign (w3[1], w3[2], offset); + c2[0] = amd_bytealign (w3[0], w3[1], offset); + c1[3] = amd_bytealign (w2[3], w3[0], offset); + c1[2] = amd_bytealign (w2[2], w2[3], offset); + c1[1] = amd_bytealign (w2[1], w2[2], offset); + c1[0] = amd_bytealign (w2[0], w2[1], offset); + c0[3] = amd_bytealign (w1[3], w2[0], offset); + c0[2] = amd_bytealign (w1[2], w1[3], offset); + c0[1] = amd_bytealign (w1[1], w1[2], offset); + c0[0] = amd_bytealign (w1[0], w1[1], offset); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = amd_bytealign (w3[3], 0, offset); + c2[3] = amd_bytealign (w3[2], w3[3], offset); + c2[2] = amd_bytealign (w3[1], w3[2], offset); + c2[1] = amd_bytealign (w3[0], w3[1], offset); + c2[0] = amd_bytealign (w2[3], w3[0], offset); + c1[3] = amd_bytealign (w2[2], w2[3], offset); + c1[2] = amd_bytealign (w2[1], w2[2], offset); + c1[1] = amd_bytealign (w2[0], w2[1], offset); + c1[0] = amd_bytealign (w1[3], w2[0], offset); + c0[3] = amd_bytealign (w1[2], w1[3], offset); + c0[2] = amd_bytealign (w1[1], w1[2], offset); + c0[1] = amd_bytealign (w1[0], w1[1], offset); + c0[0] = amd_bytealign (w0[3], w1[0], offset); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + 
w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = amd_bytealign (w3[3], 0, offset); + c3[0] = amd_bytealign (w3[2], w3[3], offset); + c2[3] = amd_bytealign (w3[1], w3[2], offset); + c2[2] = amd_bytealign (w3[0], w3[1], offset); + c2[1] = amd_bytealign (w2[3], w3[0], offset); + c2[0] = amd_bytealign (w2[2], w2[3], offset); + c1[3] = amd_bytealign (w2[1], w2[2], offset); + c1[2] = amd_bytealign (w2[0], w2[1], offset); + c1[1] = amd_bytealign (w1[3], w2[0], offset); + c1[0] = amd_bytealign (w1[2], w1[3], offset); + c0[3] = amd_bytealign (w1[1], w1[2], offset); + c0[2] = amd_bytealign (w1[0], w1[1], offset); + c0[1] = amd_bytealign (w0[3], w1[0], offset); + c0[0] = amd_bytealign (w0[2], w0[3], offset); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = amd_bytealign (w3[3], 0, offset); + c3[1] = amd_bytealign (w3[2], w3[3], offset); + c3[0] = amd_bytealign (w3[1], w3[2], offset); + c2[3] = amd_bytealign (w3[0], w3[1], offset); + c2[2] = amd_bytealign (w2[3], w3[0], offset); + c2[1] = amd_bytealign (w2[2], w2[3], offset); + c2[0] = amd_bytealign (w2[1], w2[2], offset); + c1[3] = amd_bytealign (w2[0], w2[1], offset); + c1[2] = amd_bytealign (w1[3], w2[0], offset); + c1[1] = amd_bytealign (w1[2], w1[3], offset); + c1[0] = amd_bytealign (w1[1], w1[2], offset); + c0[3] = amd_bytealign (w1[0], w1[1], offset); + c0[2] = amd_bytealign (w0[3], w1[0], offset); + c0[1] = amd_bytealign (w0[2], w0[3], offset); + c0[0] = amd_bytealign (w0[1], w0[2], offset); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 
0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = amd_bytealign (w3[3], 0, offset); + c3[2] = amd_bytealign (w3[2], w3[3], offset); + c3[1] = amd_bytealign (w3[1], w3[2], offset); + c3[0] = amd_bytealign (w3[0], w3[1], offset); + c2[3] = amd_bytealign (w2[3], w3[0], offset); + c2[2] = amd_bytealign (w2[2], w2[3], offset); + c2[1] = amd_bytealign (w2[1], w2[2], offset); + c2[0] = amd_bytealign (w2[0], w2[1], offset); + c1[3] = amd_bytealign (w1[3], w2[0], offset); + c1[2] = amd_bytealign (w1[2], w1[3], offset); + c1[1] = amd_bytealign (w1[1], w1[2], offset); + c1[0] = amd_bytealign (w1[0], w1[1], offset); + c0[3] = amd_bytealign (w0[3], w1[0], offset); + c0[2] = amd_bytealign (w0[2], w0[3], offset); + c0[1] = amd_bytealign (w0[1], w0[2], offset); + c0[0] = amd_bytealign (w0[0], w0[1], offset); + w3[3] = amd_bytealign ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + c0[0] = swap32 (c0[0]); + c0[1] = swap32 (c0[1]); + c0[2] = swap32 (c0[2]); + c0[3] = swap32 (c0[3]); + c1[0] = swap32 (c1[0]); + c1[1] = swap32 (c1[1]); + c1[2] = swap32 (c1[2]); + c1[3] = swap32 (c1[3]); + c2[0] = swap32 (c2[0]); + c2[1] = swap32 (c2[1]); + c2[2] = swap32 (c2[2]); + c2[3] = swap32 (c2[3]); + c3[0] = swap32 (c3[0]); + c3[1] = swap32 (c3[1]); + c3[2] = swap32 (c3[2]); + c3[3] = swap32 (c3[3]); + #endif + + #ifdef IS_NV + 
// todo switch (offset / 4) { case 0: @@ -4596,6 +4943,7 @@ void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; } + #endif } void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) @@ -6187,452 +6535,284 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + switch (offset / 4) { - case 0: - w7[3] = amd_bytealign (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w4[3] = 
amd_bytealign (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + case 0: + w7[3] = amd_bytealign (w7[2], w7[3], offset); + w7[2] = amd_bytealign (w7[1], w7[2], offset); + w7[1] = amd_bytealign (w7[0], w7[1], offset); + w7[0] = amd_bytealign (w6[3], w7[0], offset); + w6[3] = amd_bytealign (w6[2], w6[3], offset); + w6[2] = amd_bytealign (w6[1], w6[2], offset); + w6[1] = 
amd_bytealign (w6[0], w6[1], offset); + w6[0] = amd_bytealign (w5[3], w6[0], offset); + w5[3] = amd_bytealign (w5[2], w5[3], offset); + w5[2] = amd_bytealign (w5[1], w5[2], offset); + w5[1] = amd_bytealign (w5[0], w5[1], offset); + w5[0] = amd_bytealign (w4[3], w5[0], offset); + w4[3] = amd_bytealign (w4[2], w4[3], offset); + w4[2] = amd_bytealign (w4[1], w4[2], offset); + w4[1] = amd_bytealign (w4[0], w4[1], offset); + w4[0] = amd_bytealign (w3[3], w4[0], offset); + w3[3] = amd_bytealign (w3[2], w3[3], offset); + w3[2] = amd_bytealign (w3[1], w3[2], offset); + w3[1] = amd_bytealign (w3[0], w3[1], offset); + w3[0] = amd_bytealign (w2[3], w3[0], offset); + w2[3] = amd_bytealign (w2[2], w2[3], offset); + w2[2] = amd_bytealign (w2[1], w2[2], offset); + w2[1] = amd_bytealign (w2[0], w2[1], offset); + w2[0] = amd_bytealign (w1[3], w2[0], offset); + w1[3] = amd_bytealign (w1[2], w1[3], offset); + w1[2] = amd_bytealign (w1[1], w1[2], offset); + w1[1] = amd_bytealign (w1[0], w1[1], offset); + w1[0] = amd_bytealign (w0[3], w1[0], offset); + w0[3] = amd_bytealign (w0[2], w0[3], offset); + w0[2] = amd_bytealign (w0[1], w0[2], offset); + w0[1] = amd_bytealign (w0[0], w0[1], offset); + w0[0] = amd_bytealign ( 0, w0[0], offset); break; - case 1: - w7[3] = amd_bytealign (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[2] = 
amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 1: + w7[3] = amd_bytealign (w7[1], w7[2], offset); + w7[2] = amd_bytealign (w7[0], w7[1], offset); + w7[1] = amd_bytealign (w6[3], w7[0], offset); + w7[0] = amd_bytealign (w6[2], w6[3], offset); + w6[3] = amd_bytealign (w6[1], w6[2], offset); + w6[2] = amd_bytealign (w6[0], w6[1], offset); + w6[1] = amd_bytealign (w5[3], w6[0], offset); + w6[0] = amd_bytealign (w5[2], w5[3], offset); + w5[3] = amd_bytealign (w5[1], w5[2], offset); + w5[2] = amd_bytealign (w5[0], w5[1], offset); + w5[1] = amd_bytealign (w4[3], w5[0], offset); + w5[0] = amd_bytealign (w4[2], w4[3], offset); + w4[3] = amd_bytealign (w4[1], w4[2], offset); + w4[2] = amd_bytealign (w4[0], w4[1], offset); + w4[1] = amd_bytealign (w3[3], w4[0], offset); + w4[0] = amd_bytealign (w3[2], w3[3], offset); + w3[3] = amd_bytealign (w3[1], w3[2], offset); + w3[2] = amd_bytealign (w3[0], w3[1], offset); + w3[1] = amd_bytealign (w2[3], w3[0], offset); + w3[0] = amd_bytealign (w2[2], w2[3], offset); + w2[3] = amd_bytealign (w2[1], 
w2[2], offset); + w2[2] = amd_bytealign (w2[0], w2[1], offset); + w2[1] = amd_bytealign (w1[3], w2[0], offset); + w2[0] = amd_bytealign (w1[2], w1[3], offset); + w1[3] = amd_bytealign (w1[1], w1[2], offset); + w1[2] = amd_bytealign (w1[0], w1[1], offset); + w1[1] = amd_bytealign (w0[3], w1[0], offset); + w1[0] = amd_bytealign (w0[2], w0[3], offset); + w0[3] = amd_bytealign (w0[1], w0[2], offset); + w0[2] = amd_bytealign (w0[0], w0[1], offset); + w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign 
(w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); + case 2: + w7[3] = amd_bytealign (w7[0], w7[1], offset); + w7[2] = amd_bytealign (w6[3], w7[0], offset); + w7[1] = amd_bytealign (w6[2], w6[3], offset); + w7[0] = amd_bytealign (w6[1], w6[2], offset); + w6[3] = amd_bytealign (w6[0], w6[1], offset); + w6[2] = amd_bytealign (w5[3], w6[0], offset); + w6[1] = amd_bytealign (w5[2], w5[3], offset); + w6[0] = amd_bytealign (w5[1], w5[2], offset); + w5[3] = amd_bytealign (w5[0], w5[1], offset); + w5[2] = amd_bytealign (w4[3], w5[0], offset); + w5[1] = amd_bytealign (w4[2], w4[3], offset); + w5[0] = amd_bytealign (w4[1], w4[2], offset); + w4[3] = amd_bytealign (w4[0], w4[1], offset); + w4[2] = amd_bytealign (w3[3], w4[0], offset); + w4[1] = amd_bytealign (w3[2], w3[3], offset); + w4[0] = amd_bytealign (w3[1], w3[2], offset); + w3[3] = amd_bytealign (w3[0], w3[1], offset); + w3[2] = amd_bytealign (w2[3], w3[0], offset); + w3[1] = amd_bytealign (w2[2], w2[3], offset); + w3[0] = amd_bytealign (w2[1], w2[2], offset); + w2[3] = amd_bytealign (w2[0], w2[1], offset); + w2[2] = amd_bytealign (w1[3], w2[0], offset); + w2[1] = amd_bytealign (w1[2], w1[3], offset); + w2[0] = amd_bytealign (w1[1], w1[2], offset); + w1[3] = amd_bytealign (w1[0], w1[1], offset); + w1[2] = amd_bytealign (w0[3], 
w1[0], offset); + w1[1] = amd_bytealign (w0[2], w0[3], offset); + w1[0] = amd_bytealign (w0[1], w0[2], offset); + w0[3] = amd_bytealign (w0[0], w0[1], offset); + w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 3: - w7[3] = amd_bytealign (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign (w1[3], w1[2], 
offset_minus_4); - w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); + case 3: + w7[3] = amd_bytealign (w6[3], w7[0], offset); + w7[2] = amd_bytealign (w6[2], w6[3], offset); + w7[1] = amd_bytealign (w6[1], w6[2], offset); + w7[0] = amd_bytealign (w6[0], w6[1], offset); + w6[3] = amd_bytealign (w5[3], w6[0], offset); + w6[2] = amd_bytealign (w5[2], w5[3], offset); + w6[1] = amd_bytealign (w5[1], w5[2], offset); + w6[0] = amd_bytealign (w5[0], w5[1], offset); + w5[3] = amd_bytealign (w4[3], w5[0], offset); + w5[2] = amd_bytealign (w4[2], w4[3], offset); + w5[1] = amd_bytealign (w4[1], w4[2], offset); + w5[0] = amd_bytealign (w4[0], w4[1], offset); + w4[3] = amd_bytealign (w3[3], w4[0], offset); + w4[2] = amd_bytealign (w3[2], w3[3], offset); + w4[1] = amd_bytealign (w3[1], w3[2], offset); + w4[0] = amd_bytealign (w3[0], w3[1], offset); + w3[3] = amd_bytealign (w2[3], w3[0], offset); + w3[2] = amd_bytealign (w2[2], w2[3], offset); + w3[1] = amd_bytealign (w2[1], w2[2], offset); + w3[0] = amd_bytealign (w2[0], w2[1], offset); + w2[3] = amd_bytealign (w1[3], w2[0], offset); + w2[2] = amd_bytealign (w1[2], w1[3], offset); + w2[1] = amd_bytealign (w1[1], w1[2], offset); + w2[0] = amd_bytealign (w1[0], w1[1], offset); + w1[3] = amd_bytealign (w0[3], w1[0], offset); + w1[2] = amd_bytealign (w0[2], w0[3], offset); + w1[1] = amd_bytealign (w0[1], w0[2], offset); + w1[0] = amd_bytealign (w0[0], w0[1], offset); + w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = 
w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 4: + w7[3] = amd_bytealign (w6[2], w6[3], 
offset); + w7[2] = amd_bytealign (w6[1], w6[2], offset); + w7[1] = amd_bytealign (w6[0], w6[1], offset); + w7[0] = amd_bytealign (w5[3], w6[0], offset); + w6[3] = amd_bytealign (w5[2], w5[3], offset); + w6[2] = amd_bytealign (w5[1], w5[2], offset); + w6[1] = amd_bytealign (w5[0], w5[1], offset); + w6[0] = amd_bytealign (w4[3], w5[0], offset); + w5[3] = amd_bytealign (w4[2], w4[3], offset); + w5[2] = amd_bytealign (w4[1], w4[2], offset); + w5[1] = amd_bytealign (w4[0], w4[1], offset); + w5[0] = amd_bytealign (w3[3], w4[0], offset); + w4[3] = amd_bytealign (w3[2], w3[3], offset); + w4[2] = amd_bytealign (w3[1], w3[2], offset); + w4[1] = amd_bytealign (w3[0], w3[1], offset); + w4[0] = amd_bytealign (w2[3], w3[0], offset); + w3[3] = amd_bytealign (w2[2], w2[3], offset); + w3[2] = amd_bytealign (w2[1], w2[2], offset); + w3[1] = amd_bytealign (w2[0], w2[1], offset); + w3[0] = amd_bytealign (w1[3], w2[0], offset); + w2[3] = amd_bytealign (w1[2], w1[3], offset); + w2[2] = amd_bytealign (w1[1], w1[2], offset); + w2[1] = amd_bytealign (w1[0], w1[1], offset); + w2[0] = amd_bytealign (w0[3], w1[0], offset); + w1[3] = amd_bytealign (w0[2], w0[3], offset); + w1[2] = amd_bytealign (w0[1], w0[2], offset); + w1[1] = amd_bytealign (w0[0], w0[1], offset); + w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[1] = 
amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 5: + w7[3] = amd_bytealign (w6[1], w6[2], offset); + w7[2] = amd_bytealign (w6[0], w6[1], offset); + w7[1] = amd_bytealign (w5[3], w6[0], offset); + w7[0] = amd_bytealign (w5[2], w5[3], offset); + w6[3] = amd_bytealign (w5[1], w5[2], offset); + w6[2] = amd_bytealign (w5[0], w5[1], offset); + w6[1] = amd_bytealign (w4[3], w5[0], offset); + w6[0] = amd_bytealign (w4[2], w4[3], offset); + w5[3] = amd_bytealign (w4[1], w4[2], offset); + w5[2] = amd_bytealign (w4[0], w4[1], offset); + w5[1] = amd_bytealign (w3[3], w4[0], offset); + w5[0] = amd_bytealign (w3[2], w3[3], offset); + w4[3] = amd_bytealign 
(w3[1], w3[2], offset); + w4[2] = amd_bytealign (w3[0], w3[1], offset); + w4[1] = amd_bytealign (w2[3], w3[0], offset); + w4[0] = amd_bytealign (w2[2], w2[3], offset); + w3[3] = amd_bytealign (w2[1], w2[2], offset); + w3[2] = amd_bytealign (w2[0], w2[1], offset); + w3[1] = amd_bytealign (w1[3], w2[0], offset); + w3[0] = amd_bytealign (w1[2], w1[3], offset); + w2[3] = amd_bytealign (w1[1], w1[2], offset); + w2[2] = amd_bytealign (w1[0], w1[1], offset); + w2[1] = amd_bytealign (w0[3], w1[0], offset); + w2[0] = amd_bytealign (w0[2], w0[3], offset); + w1[3] = amd_bytealign (w0[1], w0[2], offset); + w1[2] = amd_bytealign (w0[0], w0[1], offset); + w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[2] = 
amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); + case 6: + w7[3] = amd_bytealign (w6[0], w6[1], offset); + w7[2] = amd_bytealign (w5[3], w6[0], offset); + w7[1] = amd_bytealign (w5[2], w5[3], offset); + w7[0] = amd_bytealign (w5[1], w5[2], offset); + w6[3] = amd_bytealign (w5[0], w5[1], offset); + w6[2] = amd_bytealign (w4[3], w5[0], offset); + w6[1] = amd_bytealign (w4[2], w4[3], offset); + w6[0] = amd_bytealign (w4[1], w4[2], offset); + w5[3] = amd_bytealign (w4[0], w4[1], offset); + w5[2] = amd_bytealign (w3[3], w4[0], offset); + w5[1] = amd_bytealign (w3[2], w3[3], offset); + w5[0] = amd_bytealign (w3[1], w3[2], offset); + w4[3] = amd_bytealign (w3[0], w3[1], offset); + w4[2] = amd_bytealign (w2[3], w3[0], offset); + w4[1] = amd_bytealign (w2[2], w2[3], offset); + w4[0] = amd_bytealign (w2[1], w2[2], offset); + w3[3] = amd_bytealign (w2[0], w2[1], offset); + w3[2] = amd_bytealign (w1[3], w2[0], offset); + w3[1] = amd_bytealign (w1[2], w1[3], offset); + w3[0] = amd_bytealign (w1[1], w1[2], offset); + w2[3] = amd_bytealign (w1[0], w1[1], offset); + w2[2] = amd_bytealign (w0[3], w1[0], offset); + w2[1] = amd_bytealign (w0[2], w0[3], offset); + w2[0] = amd_bytealign (w0[1], w0[2], offset); + w1[3] = amd_bytealign (w0[0], w0[1], offset); + w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 
0; @@ -6640,64 +6820,34 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign (w0[0], 0, 
offset_minus_4); + case 7: + w7[3] = amd_bytealign (w5[3], w6[0], offset); + w7[2] = amd_bytealign (w5[2], w5[3], offset); + w7[1] = amd_bytealign (w5[1], w5[2], offset); + w7[0] = amd_bytealign (w5[0], w5[1], offset); + w6[3] = amd_bytealign (w4[3], w5[0], offset); + w6[2] = amd_bytealign (w4[2], w4[3], offset); + w6[1] = amd_bytealign (w4[1], w4[2], offset); + w6[0] = amd_bytealign (w4[0], w4[1], offset); + w5[3] = amd_bytealign (w3[3], w4[0], offset); + w5[2] = amd_bytealign (w3[2], w3[3], offset); + w5[1] = amd_bytealign (w3[1], w3[2], offset); + w5[0] = amd_bytealign (w3[0], w3[1], offset); + w4[3] = amd_bytealign (w2[3], w3[0], offset); + w4[2] = amd_bytealign (w2[2], w2[3], offset); + w4[1] = amd_bytealign (w2[1], w2[2], offset); + w4[0] = amd_bytealign (w2[0], w2[1], offset); + w3[3] = amd_bytealign (w1[3], w2[0], offset); + w3[2] = amd_bytealign (w1[2], w1[3], offset); + w3[1] = amd_bytealign (w1[1], w1[2], offset); + w3[0] = amd_bytealign (w1[0], w1[1], offset); + w2[3] = amd_bytealign (w0[3], w1[0], offset); + w2[2] = amd_bytealign (w0[2], w0[3], offset); + w2[1] = amd_bytealign (w0[1], w0[2], offset); + w2[0] = amd_bytealign (w0[0], w0[1], offset); + w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -6706,62 +6856,33 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 8: - w7[3] = amd_bytealign (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign 
(w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); + case 8: + w7[3] = amd_bytealign (w5[2], w5[3], offset); + w7[2] = amd_bytealign (w5[1], w5[2], offset); + w7[1] = amd_bytealign (w5[0], w5[1], offset); + w7[0] = amd_bytealign (w4[3], w5[0], offset); + w6[3] = amd_bytealign (w4[2], w4[3], offset); + w6[2] = amd_bytealign (w4[1], w4[2], offset); + w6[1] = amd_bytealign (w4[0], w4[1], offset); + w6[0] = amd_bytealign (w3[3], w4[0], offset); + w5[3] = amd_bytealign (w3[2], w3[3], offset); + w5[2] = amd_bytealign (w3[1], w3[2], offset); + w5[1] = amd_bytealign (w3[0], w3[1], offset); + w5[0] = amd_bytealign (w2[3], w3[0], offset); + w4[3] = amd_bytealign (w2[2], w2[3], offset); + w4[2] = amd_bytealign (w2[1], w2[2], offset); + w4[1] = amd_bytealign (w2[0], w2[1], offset); + w4[0] = amd_bytealign (w1[3], w2[0], offset); + w3[3] = 
amd_bytealign (w1[2], w1[3], offset); + w3[2] = amd_bytealign (w1[1], w1[2], offset); + w3[1] = amd_bytealign (w1[0], w1[1], offset); + w3[0] = amd_bytealign (w0[3], w1[0], offset); + w2[3] = amd_bytealign (w0[2], w0[3], offset); + w2[2] = amd_bytealign (w0[1], w0[2], offset); + w2[1] = amd_bytealign (w0[0], w0[1], offset); + w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -6771,60 +6892,32 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - 
w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); + case 9: + w7[3] = amd_bytealign (w5[1], w5[2], offset); + w7[2] = amd_bytealign (w5[0], w5[1], offset); + w7[1] = amd_bytealign (w4[3], w5[0], offset); + w7[0] = amd_bytealign (w4[2], w4[3], offset); + w6[3] = amd_bytealign (w4[1], w4[2], offset); + w6[2] = amd_bytealign (w4[0], w4[1], offset); + w6[1] = amd_bytealign (w3[3], w4[0], offset); + w6[0] = amd_bytealign (w3[2], w3[3], offset); + w5[3] = amd_bytealign (w3[1], w3[2], offset); + w5[2] = amd_bytealign (w3[0], w3[1], offset); + w5[1] = amd_bytealign (w2[3], w3[0], offset); + w5[0] = amd_bytealign (w2[2], w2[3], offset); + w4[3] = amd_bytealign (w2[1], w2[2], offset); + w4[2] = amd_bytealign (w2[0], w2[1], offset); + w4[1] = amd_bytealign (w1[3], w2[0], offset); + w4[0] = amd_bytealign (w1[2], w1[3], offset); + w3[3] = amd_bytealign (w1[1], w1[2], offset); + w3[2] = amd_bytealign (w1[0], w1[1], offset); + w3[1] = amd_bytealign (w0[3], w1[0], offset); + w3[0] = amd_bytealign (w0[2], w0[3], offset); + w2[3] = amd_bytealign (w0[1], w0[2], offset); + w2[2] = amd_bytealign (w0[0], w0[1], offset); + w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -6835,58 +6928,31 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 10: - w7[3] = 
amd_bytealign (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w5[0], w5[1], offset); + w7[2] = amd_bytealign (w4[3], w5[0], offset); + w7[1] = amd_bytealign (w4[2], w4[3], offset); + w7[0] = amd_bytealign (w4[1], w4[2], offset); + w6[3] = amd_bytealign (w4[0], w4[1], offset); + w6[2] = amd_bytealign (w3[3], w4[0], offset); + w6[1] = amd_bytealign (w3[2], w3[3], offset); + w6[0] = amd_bytealign (w3[1], w3[2], offset); + w5[3] = amd_bytealign (w3[0], w3[1], offset); + w5[2] = amd_bytealign (w2[3], w3[0], offset); + w5[1] = amd_bytealign (w2[2], w2[3], offset); + w5[0] = amd_bytealign (w2[1], w2[2], offset); + w4[3] = amd_bytealign (w2[0], w2[1], offset); + w4[2] = amd_bytealign (w1[3], w2[0], offset); + w4[1] = amd_bytealign (w1[2], w1[3], offset); + w4[0] = amd_bytealign (w1[1], w1[2], offset); + w3[3] = 
amd_bytealign (w1[0], w1[1], offset); + w3[2] = amd_bytealign (w0[3], w1[0], offset); + w3[1] = amd_bytealign (w0[2], w0[3], offset); + w3[0] = amd_bytealign (w0[1], w0[2], offset); + w2[3] = amd_bytealign (w0[0], w0[1], offset); + w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -6898,56 +6964,30 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 11: - w7[3] = amd_bytealign (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign 
(w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[3], w5[0], offset); + w7[2] = amd_bytealign (w4[2], w4[3], offset); + w7[1] = amd_bytealign (w4[1], w4[2], offset); + w7[0] = amd_bytealign (w4[0], w4[1], offset); + w6[3] = amd_bytealign (w3[3], w4[0], offset); + w6[2] = amd_bytealign (w3[2], w3[3], offset); + w6[1] = amd_bytealign (w3[1], w3[2], offset); + w6[0] = amd_bytealign (w3[0], w3[1], offset); + w5[3] = amd_bytealign (w2[3], w3[0], offset); + w5[2] = amd_bytealign (w2[2], w2[3], offset); + w5[1] = amd_bytealign (w2[1], w2[2], offset); + w5[0] = amd_bytealign (w2[0], w2[1], offset); + w4[3] = amd_bytealign (w1[3], w2[0], offset); + w4[2] = amd_bytealign (w1[2], w1[3], offset); + w4[1] = amd_bytealign (w1[1], w1[2], offset); + w4[0] = amd_bytealign (w1[0], w1[1], offset); + w3[3] = amd_bytealign (w0[3], w1[0], offset); + w3[2] = amd_bytealign (w0[2], w0[3], offset); + w3[1] = amd_bytealign (w0[1], w0[2], offset); + w3[0] = amd_bytealign (w0[0], w0[1], offset); + w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -6960,54 +7000,29 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 12: - w7[3] = amd_bytealign (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[0] = 
amd_bytealign (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[2], w4[3], offset); + w7[2] = amd_bytealign (w4[1], w4[2], offset); + w7[1] = amd_bytealign (w4[0], w4[1], offset); + w7[0] = amd_bytealign (w3[3], w4[0], offset); + w6[3] = amd_bytealign (w3[2], w3[3], offset); + w6[2] = amd_bytealign (w3[1], w3[2], offset); + w6[1] = amd_bytealign (w3[0], w3[1], offset); + w6[0] = amd_bytealign (w2[3], w3[0], offset); + w5[3] = amd_bytealign (w2[2], w2[3], offset); + w5[2] = amd_bytealign (w2[1], w2[2], offset); + w5[1] = amd_bytealign (w2[0], w2[1], offset); + w5[0] = amd_bytealign (w1[3], w2[0], offset); + w4[3] = amd_bytealign (w1[2], w1[3], offset); + w4[2] = amd_bytealign (w1[1], w1[2], offset); + w4[1] = amd_bytealign (w1[0], w1[1], offset); + w4[0] = amd_bytealign (w0[3], w1[0], offset); + w3[3] = amd_bytealign (w0[2], w0[3], offset); + w3[2] = amd_bytealign (w0[1], w0[2], offset); + w3[1] = amd_bytealign (w0[0], w0[1], offset); + w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -7021,52 +7036,28 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - 
w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 13: - w7[3] = amd_bytealign (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[1], w4[2], offset); + w7[2] = amd_bytealign (w4[0], w4[1], offset); + w7[1] = amd_bytealign (w3[3], w4[0], offset); + w7[0] = amd_bytealign (w3[2], w3[3], offset); + w6[3] = amd_bytealign (w3[1], w3[2], offset); + w6[2] = amd_bytealign (w3[0], w3[1], offset); + w6[1] = amd_bytealign (w2[3], w3[0], offset); + w6[0] = amd_bytealign (w2[2], w2[3], offset); + w5[3] = amd_bytealign (w2[1], w2[2], offset); + w5[2] = amd_bytealign (w2[0], w2[1], offset); + w5[1] = amd_bytealign (w1[3], w2[0], offset); + w5[0] = amd_bytealign (w1[2], w1[3], offset); + w4[3] = amd_bytealign (w1[1], w1[2], offset); + w4[2] = amd_bytealign (w1[0], w1[1], offset); + w4[1] = amd_bytealign (w0[3], w1[0], offset); 
+ w4[0] = amd_bytealign (w0[2], w0[3], offset); + w3[3] = amd_bytealign (w0[1], w0[2], offset); + w3[2] = amd_bytealign (w0[0], w0[1], offset); + w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -7081,50 +7072,27 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 14: - w7[3] = amd_bytealign (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w4[0], w4[1], offset); + w7[2] = amd_bytealign (w3[3], w4[0], offset); + w7[1] = amd_bytealign (w3[2], w3[3], offset); + w7[0] = amd_bytealign (w3[1], w3[2], offset); + w6[3] = amd_bytealign (w3[0], w3[1], offset); + w6[2] = amd_bytealign (w2[3], 
w3[0], offset); + w6[1] = amd_bytealign (w2[2], w2[3], offset); + w6[0] = amd_bytealign (w2[1], w2[2], offset); + w5[3] = amd_bytealign (w2[0], w2[1], offset); + w5[2] = amd_bytealign (w1[3], w2[0], offset); + w5[1] = amd_bytealign (w1[2], w1[3], offset); + w5[0] = amd_bytealign (w1[1], w1[2], offset); + w4[3] = amd_bytealign (w1[0], w1[1], offset); + w4[2] = amd_bytealign (w0[3], w1[0], offset); + w4[1] = amd_bytealign (w0[2], w0[3], offset); + w4[0] = amd_bytealign (w0[1], w0[2], offset); + w3[3] = amd_bytealign (w0[0], w0[1], offset); + w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -7140,48 +7108,26 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 15: - w7[3] = amd_bytealign (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); - 
w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign (w3[3], w4[0], offset); + w7[2] = amd_bytealign (w3[2], w3[3], offset); + w7[1] = amd_bytealign (w3[1], w3[2], offset); + w7[0] = amd_bytealign (w3[0], w3[1], offset); + w6[3] = amd_bytealign (w2[3], w3[0], offset); + w6[2] = amd_bytealign (w2[2], w2[3], offset); + w6[1] = amd_bytealign (w2[1], w2[2], offset); + w6[0] = amd_bytealign (w2[0], w2[1], offset); + w5[3] = amd_bytealign (w1[3], w2[0], offset); + w5[2] = amd_bytealign (w1[2], w1[3], offset); + w5[1] = amd_bytealign (w1[1], w1[2], offset); + w5[0] = amd_bytealign (w1[0], w1[1], offset); + w4[3] = amd_bytealign (w0[3], w1[0], offset); + w4[2] = amd_bytealign (w0[2], w0[3], offset); + w4[1] = amd_bytealign (w0[1], w0[2], offset); + w4[0] = amd_bytealign (w0[0], w0[1], offset); + w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -7198,29 +7144,617 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 16: + w7[3] = amd_bytealign (w3[2], w3[3], offset); + w7[2] = amd_bytealign (w3[1], w3[2], offset); + w7[1] = amd_bytealign (w3[0], w3[1], offset); + w7[0] = amd_bytealign (w2[3], w3[0], offset); + w6[3] = amd_bytealign (w2[2], w2[3], offset); + w6[2] = amd_bytealign (w2[1], w2[2], offset); + w6[1] = amd_bytealign (w2[0], w2[1], offset); + w6[0] = amd_bytealign (w1[3], w2[0], offset); + w5[3] = amd_bytealign (w1[2], w1[3], offset); + w5[2] = amd_bytealign (w1[1], w1[2], offset); + w5[1] = amd_bytealign (w1[0], w1[1], offset); + w5[0] = amd_bytealign (w0[3], w1[0], offset); + w4[3] = amd_bytealign (w0[2], w0[3], offset); + w4[2] = 
amd_bytealign (w0[1], w0[2], offset); + w4[1] = amd_bytealign (w0[0], w0[1], offset); + w4[0] = amd_bytealign ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign (w3[1], w3[2], offset); + w7[2] = amd_bytealign (w3[0], w3[1], offset); + w7[1] = amd_bytealign (w2[3], w3[0], offset); + w7[0] = amd_bytealign (w2[2], w2[3], offset); + w6[3] = amd_bytealign (w2[1], w2[2], offset); + w6[2] = amd_bytealign (w2[0], w2[1], offset); + w6[1] = amd_bytealign (w1[3], w2[0], offset); + w6[0] = amd_bytealign (w1[2], w1[3], offset); + w5[3] = amd_bytealign (w1[1], w1[2], offset); + w5[2] = amd_bytealign (w1[0], w1[1], offset); + w5[1] = amd_bytealign (w0[3], w1[0], offset); + w5[0] = amd_bytealign (w0[2], w0[3], offset); + w4[3] = amd_bytealign (w0[1], w0[2], offset); + w4[2] = amd_bytealign (w0[0], w0[1], offset); + w4[1] = amd_bytealign ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 18: + w7[3] = amd_bytealign (w3[0], w3[1], offset); + w7[2] = amd_bytealign (w2[3], w3[0], offset); + w7[1] = amd_bytealign (w2[2], w2[3], offset); + w7[0] = amd_bytealign (w2[1], w2[2], offset); + w6[3] = amd_bytealign (w2[0], w2[1], offset); + w6[2] = amd_bytealign (w1[3], w2[0], offset); + w6[1] = amd_bytealign (w1[2], w1[3], offset); + w6[0] = amd_bytealign (w1[1], w1[2], offset); + w5[3] = amd_bytealign (w1[0], w1[1], offset); + w5[2] = amd_bytealign (w0[3], w1[0], offset); + w5[1] = amd_bytealign (w0[2], w0[3], offset); + w5[0] = amd_bytealign (w0[1], w0[2], offset); + w4[3] = amd_bytealign (w0[0], w0[1], offset); + w4[2] = amd_bytealign ( 0, w0[0], offset); + 
w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + w7[3] = amd_bytealign (w2[3], w3[0], offset); + w7[2] = amd_bytealign (w2[2], w2[3], offset); + w7[1] = amd_bytealign (w2[1], w2[2], offset); + w7[0] = amd_bytealign (w2[0], w2[1], offset); + w6[3] = amd_bytealign (w1[3], w2[0], offset); + w6[2] = amd_bytealign (w1[2], w1[3], offset); + w6[1] = amd_bytealign (w1[1], w1[2], offset); + w6[0] = amd_bytealign (w1[0], w1[1], offset); + w5[3] = amd_bytealign (w0[3], w1[0], offset); + w5[2] = amd_bytealign (w0[2], w0[3], offset); + w5[1] = amd_bytealign (w0[1], w0[2], offset); + w5[0] = amd_bytealign (w0[0], w0[1], offset); + w4[3] = amd_bytealign ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = amd_bytealign (w2[2], w2[3], offset); + w7[2] = amd_bytealign (w2[1], w2[2], offset); + w7[1] = amd_bytealign (w2[0], w2[1], offset); + w7[0] = amd_bytealign (w1[3], w2[0], offset); + w6[3] = amd_bytealign (w1[2], w1[3], offset); + w6[2] = amd_bytealign (w1[1], w1[2], offset); + w6[1] = amd_bytealign (w1[0], w1[1], offset); + w6[0] = amd_bytealign (w0[3], w1[0], offset); + w5[3] = amd_bytealign (w0[2], w0[3], offset); + w5[2] = amd_bytealign (w0[1], w0[2], offset); + w5[1] = amd_bytealign (w0[0], w0[1], offset); + w5[0] = amd_bytealign ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + 
case 21: + w7[3] = amd_bytealign (w2[1], w2[2], offset); + w7[2] = amd_bytealign (w2[0], w2[1], offset); + w7[1] = amd_bytealign (w1[3], w2[0], offset); + w7[0] = amd_bytealign (w1[2], w1[3], offset); + w6[3] = amd_bytealign (w1[1], w1[2], offset); + w6[2] = amd_bytealign (w1[0], w1[1], offset); + w6[1] = amd_bytealign (w0[3], w1[0], offset); + w6[0] = amd_bytealign (w0[2], w0[3], offset); + w5[3] = amd_bytealign (w0[1], w0[2], offset); + w5[2] = amd_bytealign (w0[0], w0[1], offset); + w5[1] = amd_bytealign ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = amd_bytealign (w2[0], w2[1], offset); + w7[2] = amd_bytealign (w1[3], w2[0], offset); + w7[1] = amd_bytealign (w1[2], w1[3], offset); + w7[0] = amd_bytealign (w1[1], w1[2], offset); + w6[3] = amd_bytealign (w1[0], w1[1], offset); + w6[2] = amd_bytealign (w0[3], w1[0], offset); + w6[1] = amd_bytealign (w0[2], w0[3], offset); + w6[0] = amd_bytealign (w0[1], w0[2], offset); + w5[3] = amd_bytealign (w0[0], w0[1], offset); + w5[2] = amd_bytealign ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = amd_bytealign (w1[3], w2[0], offset); + w7[2] = amd_bytealign (w1[2], w1[3], offset); + w7[1] = amd_bytealign (w1[1], w1[2], offset); + w7[0] = amd_bytealign (w1[0], w1[1], offset); + w6[3] = amd_bytealign (w0[3], w1[0], offset); + w6[2] = amd_bytealign (w0[2], w0[3], offset); + w6[1] = amd_bytealign (w0[1], w0[2], offset); + w6[0] = amd_bytealign (w0[0], w0[1], 
offset); + w5[3] = amd_bytealign ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = amd_bytealign (w1[2], w1[3], offset); + w7[2] = amd_bytealign (w1[1], w1[2], offset); + w7[1] = amd_bytealign (w1[0], w1[1], offset); + w7[0] = amd_bytealign (w0[3], w1[0], offset); + w6[3] = amd_bytealign (w0[2], w0[3], offset); + w6[2] = amd_bytealign (w0[1], w0[2], offset); + w6[1] = amd_bytealign (w0[0], w0[1], offset); + w6[0] = amd_bytealign ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] = amd_bytealign (w1[1], w1[2], offset); + w7[2] = amd_bytealign (w1[0], w1[1], offset); + w7[1] = amd_bytealign (w0[3], w1[0], offset); + w7[0] = amd_bytealign (w0[2], w0[3], offset); + w6[3] = amd_bytealign (w0[1], w0[2], offset); + w6[2] = amd_bytealign (w0[0], w0[1], offset); + w6[1] = amd_bytealign ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = amd_bytealign (w1[0], w1[1], offset); + w7[2] = amd_bytealign (w0[3], w1[0], offset); + w7[1] = amd_bytealign (w0[2], w0[3], offset); + w7[0] = amd_bytealign (w0[1], w0[2], offset); + w6[3] = amd_bytealign (w0[0], 
w0[1], offset); + w6[2] = amd_bytealign ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign (w0[3], w1[0], offset); + w7[2] = amd_bytealign (w0[2], w0[3], offset); + w7[1] = amd_bytealign (w0[1], w0[2], offset); + w7[0] = amd_bytealign (w0[0], w0[1], offset); + w6[3] = amd_bytealign ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = amd_bytealign (w0[2], w0[3], offset); + w7[2] = amd_bytealign (w0[1], w0[2], offset); + w7[1] = amd_bytealign (w0[0], w0[1], offset); + w7[0] = amd_bytealign ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = amd_bytealign (w0[1], w0[2], offset); + w7[2] = amd_bytealign (w0[0], w0[1], offset); + w7[1] = amd_bytealign ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; 
+ w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = amd_bytealign (w0[0], w0[1], offset); + w7[2] = amd_bytealign ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = amd_bytealign ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); #endif #ifdef IS_NV @@ -13485,6 +14019,17456 @@ 
void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u #endif } +void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) +{ + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); + + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign (w[62], w[63], offset); + w[62] = amd_bytealign (w[61], w[62], offset); + w[61] = amd_bytealign (w[60], w[61], offset); + w[60] = amd_bytealign (w[59], w[60], offset); + w[59] = amd_bytealign (w[58], w[59], offset); + w[58] = amd_bytealign (w[57], w[58], offset); + w[57] = amd_bytealign (w[56], w[57], offset); + w[56] = amd_bytealign (w[55], w[56], offset); + w[55] = amd_bytealign (w[54], w[55], offset); + w[54] = amd_bytealign (w[53], w[54], offset); + w[53] = amd_bytealign (w[52], w[53], offset); + w[52] = amd_bytealign (w[51], w[52], offset); + w[51] = amd_bytealign (w[50], w[51], offset); + w[50] = amd_bytealign (w[49], w[50], offset); + w[49] = amd_bytealign (w[48], w[49], offset); + w[48] = amd_bytealign (w[47], w[48], offset); + w[47] = amd_bytealign (w[46], w[47], offset); + w[46] = amd_bytealign (w[45], w[46], offset); + w[45] = amd_bytealign (w[44], w[45], offset); + w[44] = amd_bytealign (w[43], w[44], offset); + w[43] = amd_bytealign (w[42], w[43], offset); + w[42] = amd_bytealign (w[41], w[42], offset); + w[41] = amd_bytealign (w[40], w[41], offset); + w[40] = amd_bytealign (w[39], w[40], offset); + w[39] = amd_bytealign (w[38], w[39], offset); + w[38] = amd_bytealign (w[37], w[38], offset); + w[37] = amd_bytealign (w[36], w[37], offset); + w[36] = amd_bytealign (w[35], w[36], offset); + w[35] = amd_bytealign (w[34], w[35], offset); + w[34] = amd_bytealign (w[33], w[34], offset); + w[33] = amd_bytealign (w[32], w[33], offset); + w[32] = amd_bytealign (w[31], w[32], offset); + w[31] = amd_bytealign (w[30], w[31], offset); + w[30] = 
amd_bytealign (w[29], w[30], offset); + w[29] = amd_bytealign (w[28], w[29], offset); + w[28] = amd_bytealign (w[27], w[28], offset); + w[27] = amd_bytealign (w[26], w[27], offset); + w[26] = amd_bytealign (w[25], w[26], offset); + w[25] = amd_bytealign (w[24], w[25], offset); + w[24] = amd_bytealign (w[23], w[24], offset); + w[23] = amd_bytealign (w[22], w[23], offset); + w[22] = amd_bytealign (w[21], w[22], offset); + w[21] = amd_bytealign (w[20], w[21], offset); + w[20] = amd_bytealign (w[19], w[20], offset); + w[19] = amd_bytealign (w[18], w[19], offset); + w[18] = amd_bytealign (w[17], w[18], offset); + w[17] = amd_bytealign (w[16], w[17], offset); + w[16] = amd_bytealign (w[15], w[16], offset); + w[15] = amd_bytealign (w[14], w[15], offset); + w[14] = amd_bytealign (w[13], w[14], offset); + w[13] = amd_bytealign (w[12], w[13], offset); + w[12] = amd_bytealign (w[11], w[12], offset); + w[11] = amd_bytealign (w[10], w[11], offset); + w[10] = amd_bytealign (w[ 9], w[10], offset); + w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign (w[61], w[62], offset); + w[62] = amd_bytealign (w[60], w[61], offset); + w[61] = amd_bytealign (w[59], w[60], offset); + w[60] = amd_bytealign (w[58], w[59], offset); + w[59] = amd_bytealign (w[57], w[58], offset); + w[58] = amd_bytealign (w[56], w[57], offset); + w[57] = amd_bytealign (w[55], w[56], offset); + w[56] = amd_bytealign (w[54], w[55], offset); + w[55] = amd_bytealign (w[53], w[54], offset); + w[54] = amd_bytealign (w[52], w[53], offset); + w[53] = 
amd_bytealign (w[51], w[52], offset); + w[52] = amd_bytealign (w[50], w[51], offset); + w[51] = amd_bytealign (w[49], w[50], offset); + w[50] = amd_bytealign (w[48], w[49], offset); + w[49] = amd_bytealign (w[47], w[48], offset); + w[48] = amd_bytealign (w[46], w[47], offset); + w[47] = amd_bytealign (w[45], w[46], offset); + w[46] = amd_bytealign (w[44], w[45], offset); + w[45] = amd_bytealign (w[43], w[44], offset); + w[44] = amd_bytealign (w[42], w[43], offset); + w[43] = amd_bytealign (w[41], w[42], offset); + w[42] = amd_bytealign (w[40], w[41], offset); + w[41] = amd_bytealign (w[39], w[40], offset); + w[40] = amd_bytealign (w[38], w[39], offset); + w[39] = amd_bytealign (w[37], w[38], offset); + w[38] = amd_bytealign (w[36], w[37], offset); + w[37] = amd_bytealign (w[35], w[36], offset); + w[36] = amd_bytealign (w[34], w[35], offset); + w[35] = amd_bytealign (w[33], w[34], offset); + w[34] = amd_bytealign (w[32], w[33], offset); + w[33] = amd_bytealign (w[31], w[32], offset); + w[32] = amd_bytealign (w[30], w[31], offset); + w[31] = amd_bytealign (w[29], w[30], offset); + w[30] = amd_bytealign (w[28], w[29], offset); + w[29] = amd_bytealign (w[27], w[28], offset); + w[28] = amd_bytealign (w[26], w[27], offset); + w[27] = amd_bytealign (w[25], w[26], offset); + w[26] = amd_bytealign (w[24], w[25], offset); + w[25] = amd_bytealign (w[23], w[24], offset); + w[24] = amd_bytealign (w[22], w[23], offset); + w[23] = amd_bytealign (w[21], w[22], offset); + w[22] = amd_bytealign (w[20], w[21], offset); + w[21] = amd_bytealign (w[19], w[20], offset); + w[20] = amd_bytealign (w[18], w[19], offset); + w[19] = amd_bytealign (w[17], w[18], offset); + w[18] = amd_bytealign (w[16], w[17], offset); + w[17] = amd_bytealign (w[15], w[16], offset); + w[16] = amd_bytealign (w[14], w[15], offset); + w[15] = amd_bytealign (w[13], w[14], offset); + w[14] = amd_bytealign (w[12], w[13], offset); + w[13] = amd_bytealign (w[11], w[12], offset); + w[12] = amd_bytealign (w[10], w[11], 
offset); + w[11] = amd_bytealign (w[ 9], w[10], offset); + w[10] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign (w[60], w[61], offset); + w[62] = amd_bytealign (w[59], w[60], offset); + w[61] = amd_bytealign (w[58], w[59], offset); + w[60] = amd_bytealign (w[57], w[58], offset); + w[59] = amd_bytealign (w[56], w[57], offset); + w[58] = amd_bytealign (w[55], w[56], offset); + w[57] = amd_bytealign (w[54], w[55], offset); + w[56] = amd_bytealign (w[53], w[54], offset); + w[55] = amd_bytealign (w[52], w[53], offset); + w[54] = amd_bytealign (w[51], w[52], offset); + w[53] = amd_bytealign (w[50], w[51], offset); + w[52] = amd_bytealign (w[49], w[50], offset); + w[51] = amd_bytealign (w[48], w[49], offset); + w[50] = amd_bytealign (w[47], w[48], offset); + w[49] = amd_bytealign (w[46], w[47], offset); + w[48] = amd_bytealign (w[45], w[46], offset); + w[47] = amd_bytealign (w[44], w[45], offset); + w[46] = amd_bytealign (w[43], w[44], offset); + w[45] = amd_bytealign (w[42], w[43], offset); + w[44] = amd_bytealign (w[41], w[42], offset); + w[43] = amd_bytealign (w[40], w[41], offset); + w[42] = amd_bytealign (w[39], w[40], offset); + w[41] = amd_bytealign (w[38], w[39], offset); + w[40] = amd_bytealign (w[37], w[38], offset); + w[39] = amd_bytealign (w[36], w[37], offset); + w[38] = amd_bytealign (w[35], w[36], offset); + w[37] = amd_bytealign (w[34], w[35], offset); + w[36] = amd_bytealign (w[33], w[34], offset); + w[35] = amd_bytealign (w[32], w[33], offset); + w[34] = amd_bytealign (w[31], 
w[32], offset); + w[33] = amd_bytealign (w[30], w[31], offset); + w[32] = amd_bytealign (w[29], w[30], offset); + w[31] = amd_bytealign (w[28], w[29], offset); + w[30] = amd_bytealign (w[27], w[28], offset); + w[29] = amd_bytealign (w[26], w[27], offset); + w[28] = amd_bytealign (w[25], w[26], offset); + w[27] = amd_bytealign (w[24], w[25], offset); + w[26] = amd_bytealign (w[23], w[24], offset); + w[25] = amd_bytealign (w[22], w[23], offset); + w[24] = amd_bytealign (w[21], w[22], offset); + w[23] = amd_bytealign (w[20], w[21], offset); + w[22] = amd_bytealign (w[19], w[20], offset); + w[21] = amd_bytealign (w[18], w[19], offset); + w[20] = amd_bytealign (w[17], w[18], offset); + w[19] = amd_bytealign (w[16], w[17], offset); + w[18] = amd_bytealign (w[15], w[16], offset); + w[17] = amd_bytealign (w[14], w[15], offset); + w[16] = amd_bytealign (w[13], w[14], offset); + w[15] = amd_bytealign (w[12], w[13], offset); + w[14] = amd_bytealign (w[11], w[12], offset); + w[13] = amd_bytealign (w[10], w[11], offset); + w[12] = amd_bytealign (w[ 9], w[10], offset); + w[11] = amd_bytealign (w[ 8], w[ 9], offset); + w[10] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign (w[59], w[60], offset); + w[62] = amd_bytealign (w[58], w[59], offset); + w[61] = amd_bytealign (w[57], w[58], offset); + w[60] = amd_bytealign (w[56], w[57], offset); + w[59] = amd_bytealign (w[55], w[56], offset); + w[58] = amd_bytealign (w[54], w[55], offset); + w[57] = amd_bytealign (w[53], w[54], offset); + w[56] = amd_bytealign (w[52], w[53], offset); + w[55] = 
amd_bytealign (w[51], w[52], offset); + w[54] = amd_bytealign (w[50], w[51], offset); + w[53] = amd_bytealign (w[49], w[50], offset); + w[52] = amd_bytealign (w[48], w[49], offset); + w[51] = amd_bytealign (w[47], w[48], offset); + w[50] = amd_bytealign (w[46], w[47], offset); + w[49] = amd_bytealign (w[45], w[46], offset); + w[48] = amd_bytealign (w[44], w[45], offset); + w[47] = amd_bytealign (w[43], w[44], offset); + w[46] = amd_bytealign (w[42], w[43], offset); + w[45] = amd_bytealign (w[41], w[42], offset); + w[44] = amd_bytealign (w[40], w[41], offset); + w[43] = amd_bytealign (w[39], w[40], offset); + w[42] = amd_bytealign (w[38], w[39], offset); + w[41] = amd_bytealign (w[37], w[38], offset); + w[40] = amd_bytealign (w[36], w[37], offset); + w[39] = amd_bytealign (w[35], w[36], offset); + w[38] = amd_bytealign (w[34], w[35], offset); + w[37] = amd_bytealign (w[33], w[34], offset); + w[36] = amd_bytealign (w[32], w[33], offset); + w[35] = amd_bytealign (w[31], w[32], offset); + w[34] = amd_bytealign (w[30], w[31], offset); + w[33] = amd_bytealign (w[29], w[30], offset); + w[32] = amd_bytealign (w[28], w[29], offset); + w[31] = amd_bytealign (w[27], w[28], offset); + w[30] = amd_bytealign (w[26], w[27], offset); + w[29] = amd_bytealign (w[25], w[26], offset); + w[28] = amd_bytealign (w[24], w[25], offset); + w[27] = amd_bytealign (w[23], w[24], offset); + w[26] = amd_bytealign (w[22], w[23], offset); + w[25] = amd_bytealign (w[21], w[22], offset); + w[24] = amd_bytealign (w[20], w[21], offset); + w[23] = amd_bytealign (w[19], w[20], offset); + w[22] = amd_bytealign (w[18], w[19], offset); + w[21] = amd_bytealign (w[17], w[18], offset); + w[20] = amd_bytealign (w[16], w[17], offset); + w[19] = amd_bytealign (w[15], w[16], offset); + w[18] = amd_bytealign (w[14], w[15], offset); + w[17] = amd_bytealign (w[13], w[14], offset); + w[16] = amd_bytealign (w[12], w[13], offset); + w[15] = amd_bytealign (w[11], w[12], offset); + w[14] = amd_bytealign (w[10], w[11], 
offset); + w[13] = amd_bytealign (w[ 9], w[10], offset); + w[12] = amd_bytealign (w[ 8], w[ 9], offset); + w[11] = amd_bytealign (w[ 7], w[ 8], offset); + w[10] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign (w[58], w[59], offset); + w[62] = amd_bytealign (w[57], w[58], offset); + w[61] = amd_bytealign (w[56], w[57], offset); + w[60] = amd_bytealign (w[55], w[56], offset); + w[59] = amd_bytealign (w[54], w[55], offset); + w[58] = amd_bytealign (w[53], w[54], offset); + w[57] = amd_bytealign (w[52], w[53], offset); + w[56] = amd_bytealign (w[51], w[52], offset); + w[55] = amd_bytealign (w[50], w[51], offset); + w[54] = amd_bytealign (w[49], w[50], offset); + w[53] = amd_bytealign (w[48], w[49], offset); + w[52] = amd_bytealign (w[47], w[48], offset); + w[51] = amd_bytealign (w[46], w[47], offset); + w[50] = amd_bytealign (w[45], w[46], offset); + w[49] = amd_bytealign (w[44], w[45], offset); + w[48] = amd_bytealign (w[43], w[44], offset); + w[47] = amd_bytealign (w[42], w[43], offset); + w[46] = amd_bytealign (w[41], w[42], offset); + w[45] = amd_bytealign (w[40], w[41], offset); + w[44] = amd_bytealign (w[39], w[40], offset); + w[43] = amd_bytealign (w[38], w[39], offset); + w[42] = amd_bytealign (w[37], w[38], offset); + w[41] = amd_bytealign (w[36], w[37], offset); + w[40] = amd_bytealign (w[35], w[36], offset); + w[39] = amd_bytealign (w[34], w[35], offset); + w[38] = amd_bytealign (w[33], w[34], offset); + w[37] = amd_bytealign (w[32], w[33], offset); + w[36] = amd_bytealign (w[31], w[32], offset); + w[35] = amd_bytealign (w[30], w[31], offset); + w[34] = 
amd_bytealign (w[29], w[30], offset); + w[33] = amd_bytealign (w[28], w[29], offset); + w[32] = amd_bytealign (w[27], w[28], offset); + w[31] = amd_bytealign (w[26], w[27], offset); + w[30] = amd_bytealign (w[25], w[26], offset); + w[29] = amd_bytealign (w[24], w[25], offset); + w[28] = amd_bytealign (w[23], w[24], offset); + w[27] = amd_bytealign (w[22], w[23], offset); + w[26] = amd_bytealign (w[21], w[22], offset); + w[25] = amd_bytealign (w[20], w[21], offset); + w[24] = amd_bytealign (w[19], w[20], offset); + w[23] = amd_bytealign (w[18], w[19], offset); + w[22] = amd_bytealign (w[17], w[18], offset); + w[21] = amd_bytealign (w[16], w[17], offset); + w[20] = amd_bytealign (w[15], w[16], offset); + w[19] = amd_bytealign (w[14], w[15], offset); + w[18] = amd_bytealign (w[13], w[14], offset); + w[17] = amd_bytealign (w[12], w[13], offset); + w[16] = amd_bytealign (w[11], w[12], offset); + w[15] = amd_bytealign (w[10], w[11], offset); + w[14] = amd_bytealign (w[ 9], w[10], offset); + w[13] = amd_bytealign (w[ 8], w[ 9], offset); + w[12] = amd_bytealign (w[ 7], w[ 8], offset); + w[11] = amd_bytealign (w[ 6], w[ 7], offset); + w[10] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign (w[57], w[58], offset); + w[62] = amd_bytealign (w[56], w[57], offset); + w[61] = amd_bytealign (w[55], w[56], offset); + w[60] = amd_bytealign (w[54], w[55], offset); + w[59] = amd_bytealign (w[53], w[54], offset); + w[58] = amd_bytealign (w[52], w[53], offset); + w[57] = amd_bytealign (w[51], w[52], offset); + w[56] = amd_bytealign (w[50], w[51], offset); + w[55] = amd_bytealign (w[49], w[50], offset); + w[54] = 
amd_bytealign (w[48], w[49], offset); + w[53] = amd_bytealign (w[47], w[48], offset); + w[52] = amd_bytealign (w[46], w[47], offset); + w[51] = amd_bytealign (w[45], w[46], offset); + w[50] = amd_bytealign (w[44], w[45], offset); + w[49] = amd_bytealign (w[43], w[44], offset); + w[48] = amd_bytealign (w[42], w[43], offset); + w[47] = amd_bytealign (w[41], w[42], offset); + w[46] = amd_bytealign (w[40], w[41], offset); + w[45] = amd_bytealign (w[39], w[40], offset); + w[44] = amd_bytealign (w[38], w[39], offset); + w[43] = amd_bytealign (w[37], w[38], offset); + w[42] = amd_bytealign (w[36], w[37], offset); + w[41] = amd_bytealign (w[35], w[36], offset); + w[40] = amd_bytealign (w[34], w[35], offset); + w[39] = amd_bytealign (w[33], w[34], offset); + w[38] = amd_bytealign (w[32], w[33], offset); + w[37] = amd_bytealign (w[31], w[32], offset); + w[36] = amd_bytealign (w[30], w[31], offset); + w[35] = amd_bytealign (w[29], w[30], offset); + w[34] = amd_bytealign (w[28], w[29], offset); + w[33] = amd_bytealign (w[27], w[28], offset); + w[32] = amd_bytealign (w[26], w[27], offset); + w[31] = amd_bytealign (w[25], w[26], offset); + w[30] = amd_bytealign (w[24], w[25], offset); + w[29] = amd_bytealign (w[23], w[24], offset); + w[28] = amd_bytealign (w[22], w[23], offset); + w[27] = amd_bytealign (w[21], w[22], offset); + w[26] = amd_bytealign (w[20], w[21], offset); + w[25] = amd_bytealign (w[19], w[20], offset); + w[24] = amd_bytealign (w[18], w[19], offset); + w[23] = amd_bytealign (w[17], w[18], offset); + w[22] = amd_bytealign (w[16], w[17], offset); + w[21] = amd_bytealign (w[15], w[16], offset); + w[20] = amd_bytealign (w[14], w[15], offset); + w[19] = amd_bytealign (w[13], w[14], offset); + w[18] = amd_bytealign (w[12], w[13], offset); + w[17] = amd_bytealign (w[11], w[12], offset); + w[16] = amd_bytealign (w[10], w[11], offset); + w[15] = amd_bytealign (w[ 9], w[10], offset); + w[14] = amd_bytealign (w[ 8], w[ 9], offset); + w[13] = amd_bytealign (w[ 7], w[ 8], 
offset); + w[12] = amd_bytealign (w[ 6], w[ 7], offset); + w[11] = amd_bytealign (w[ 5], w[ 6], offset); + w[10] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign (w[56], w[57], offset); + w[62] = amd_bytealign (w[55], w[56], offset); + w[61] = amd_bytealign (w[54], w[55], offset); + w[60] = amd_bytealign (w[53], w[54], offset); + w[59] = amd_bytealign (w[52], w[53], offset); + w[58] = amd_bytealign (w[51], w[52], offset); + w[57] = amd_bytealign (w[50], w[51], offset); + w[56] = amd_bytealign (w[49], w[50], offset); + w[55] = amd_bytealign (w[48], w[49], offset); + w[54] = amd_bytealign (w[47], w[48], offset); + w[53] = amd_bytealign (w[46], w[47], offset); + w[52] = amd_bytealign (w[45], w[46], offset); + w[51] = amd_bytealign (w[44], w[45], offset); + w[50] = amd_bytealign (w[43], w[44], offset); + w[49] = amd_bytealign (w[42], w[43], offset); + w[48] = amd_bytealign (w[41], w[42], offset); + w[47] = amd_bytealign (w[40], w[41], offset); + w[46] = amd_bytealign (w[39], w[40], offset); + w[45] = amd_bytealign (w[38], w[39], offset); + w[44] = amd_bytealign (w[37], w[38], offset); + w[43] = amd_bytealign (w[36], w[37], offset); + w[42] = amd_bytealign (w[35], w[36], offset); + w[41] = amd_bytealign (w[34], w[35], offset); + w[40] = amd_bytealign (w[33], w[34], offset); + w[39] = amd_bytealign (w[32], w[33], offset); + w[38] = amd_bytealign (w[31], w[32], offset); + w[37] = amd_bytealign (w[30], w[31], offset); + w[36] = amd_bytealign (w[29], w[30], offset); + w[35] = amd_bytealign (w[28], w[29], offset); + w[34] = amd_bytealign (w[27], w[28], offset); + w[33] = amd_bytealign (w[26], w[27], offset); + w[32] = amd_bytealign (w[25], 
w[26], offset); + w[31] = amd_bytealign (w[24], w[25], offset); + w[30] = amd_bytealign (w[23], w[24], offset); + w[29] = amd_bytealign (w[22], w[23], offset); + w[28] = amd_bytealign (w[21], w[22], offset); + w[27] = amd_bytealign (w[20], w[21], offset); + w[26] = amd_bytealign (w[19], w[20], offset); + w[25] = amd_bytealign (w[18], w[19], offset); + w[24] = amd_bytealign (w[17], w[18], offset); + w[23] = amd_bytealign (w[16], w[17], offset); + w[22] = amd_bytealign (w[15], w[16], offset); + w[21] = amd_bytealign (w[14], w[15], offset); + w[20] = amd_bytealign (w[13], w[14], offset); + w[19] = amd_bytealign (w[12], w[13], offset); + w[18] = amd_bytealign (w[11], w[12], offset); + w[17] = amd_bytealign (w[10], w[11], offset); + w[16] = amd_bytealign (w[ 9], w[10], offset); + w[15] = amd_bytealign (w[ 8], w[ 9], offset); + w[14] = amd_bytealign (w[ 7], w[ 8], offset); + w[13] = amd_bytealign (w[ 6], w[ 7], offset); + w[12] = amd_bytealign (w[ 5], w[ 6], offset); + w[11] = amd_bytealign (w[ 4], w[ 5], offset); + w[10] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign (w[55], w[56], offset); + w[62] = amd_bytealign (w[54], w[55], offset); + w[61] = amd_bytealign (w[53], w[54], offset); + w[60] = amd_bytealign (w[52], w[53], offset); + w[59] = amd_bytealign (w[51], w[52], offset); + w[58] = amd_bytealign (w[50], w[51], offset); + w[57] = amd_bytealign (w[49], w[50], offset); + w[56] = amd_bytealign (w[48], w[49], offset); + w[55] = amd_bytealign (w[47], w[48], offset); + w[54] = amd_bytealign (w[46], w[47], offset); + w[53] = amd_bytealign (w[45], w[46], offset); + w[52] = amd_bytealign (w[44], w[45], offset); + w[51] = amd_bytealign (w[43], w[44], offset); + w[50] = 
amd_bytealign (w[42], w[43], offset); + w[49] = amd_bytealign (w[41], w[42], offset); + w[48] = amd_bytealign (w[40], w[41], offset); + w[47] = amd_bytealign (w[39], w[40], offset); + w[46] = amd_bytealign (w[38], w[39], offset); + w[45] = amd_bytealign (w[37], w[38], offset); + w[44] = amd_bytealign (w[36], w[37], offset); + w[43] = amd_bytealign (w[35], w[36], offset); + w[42] = amd_bytealign (w[34], w[35], offset); + w[41] = amd_bytealign (w[33], w[34], offset); + w[40] = amd_bytealign (w[32], w[33], offset); + w[39] = amd_bytealign (w[31], w[32], offset); + w[38] = amd_bytealign (w[30], w[31], offset); + w[37] = amd_bytealign (w[29], w[30], offset); + w[36] = amd_bytealign (w[28], w[29], offset); + w[35] = amd_bytealign (w[27], w[28], offset); + w[34] = amd_bytealign (w[26], w[27], offset); + w[33] = amd_bytealign (w[25], w[26], offset); + w[32] = amd_bytealign (w[24], w[25], offset); + w[31] = amd_bytealign (w[23], w[24], offset); + w[30] = amd_bytealign (w[22], w[23], offset); + w[29] = amd_bytealign (w[21], w[22], offset); + w[28] = amd_bytealign (w[20], w[21], offset); + w[27] = amd_bytealign (w[19], w[20], offset); + w[26] = amd_bytealign (w[18], w[19], offset); + w[25] = amd_bytealign (w[17], w[18], offset); + w[24] = amd_bytealign (w[16], w[17], offset); + w[23] = amd_bytealign (w[15], w[16], offset); + w[22] = amd_bytealign (w[14], w[15], offset); + w[21] = amd_bytealign (w[13], w[14], offset); + w[20] = amd_bytealign (w[12], w[13], offset); + w[19] = amd_bytealign (w[11], w[12], offset); + w[18] = amd_bytealign (w[10], w[11], offset); + w[17] = amd_bytealign (w[ 9], w[10], offset); + w[16] = amd_bytealign (w[ 8], w[ 9], offset); + w[15] = amd_bytealign (w[ 7], w[ 8], offset); + w[14] = amd_bytealign (w[ 6], w[ 7], offset); + w[13] = amd_bytealign (w[ 5], w[ 6], offset); + w[12] = amd_bytealign (w[ 4], w[ 5], offset); + w[11] = amd_bytealign (w[ 3], w[ 4], offset); + w[10] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign (w[ 1], w[ 2], 
offset); + w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = amd_bytealign (w[54], w[55], offset); + w[62] = amd_bytealign (w[53], w[54], offset); + w[61] = amd_bytealign (w[52], w[53], offset); + w[60] = amd_bytealign (w[51], w[52], offset); + w[59] = amd_bytealign (w[50], w[51], offset); + w[58] = amd_bytealign (w[49], w[50], offset); + w[57] = amd_bytealign (w[48], w[49], offset); + w[56] = amd_bytealign (w[47], w[48], offset); + w[55] = amd_bytealign (w[46], w[47], offset); + w[54] = amd_bytealign (w[45], w[46], offset); + w[53] = amd_bytealign (w[44], w[45], offset); + w[52] = amd_bytealign (w[43], w[44], offset); + w[51] = amd_bytealign (w[42], w[43], offset); + w[50] = amd_bytealign (w[41], w[42], offset); + w[49] = amd_bytealign (w[40], w[41], offset); + w[48] = amd_bytealign (w[39], w[40], offset); + w[47] = amd_bytealign (w[38], w[39], offset); + w[46] = amd_bytealign (w[37], w[38], offset); + w[45] = amd_bytealign (w[36], w[37], offset); + w[44] = amd_bytealign (w[35], w[36], offset); + w[43] = amd_bytealign (w[34], w[35], offset); + w[42] = amd_bytealign (w[33], w[34], offset); + w[41] = amd_bytealign (w[32], w[33], offset); + w[40] = amd_bytealign (w[31], w[32], offset); + w[39] = amd_bytealign (w[30], w[31], offset); + w[38] = amd_bytealign (w[29], w[30], offset); + w[37] = amd_bytealign (w[28], w[29], offset); + w[36] = amd_bytealign (w[27], w[28], offset); + w[35] = amd_bytealign (w[26], w[27], offset); + w[34] = amd_bytealign (w[25], w[26], offset); + w[33] = amd_bytealign (w[24], w[25], offset); + w[32] = amd_bytealign (w[23], w[24], offset); + w[31] = amd_bytealign (w[22], w[23], offset); + w[30] = amd_bytealign (w[21], w[22], offset); + w[29] = amd_bytealign (w[20], w[21], offset); + w[28] = amd_bytealign (w[19], w[20], offset); + w[27] = amd_bytealign (w[18], w[19], offset); + w[26] 
= amd_bytealign (w[17], w[18], offset); + w[25] = amd_bytealign (w[16], w[17], offset); + w[24] = amd_bytealign (w[15], w[16], offset); + w[23] = amd_bytealign (w[14], w[15], offset); + w[22] = amd_bytealign (w[13], w[14], offset); + w[21] = amd_bytealign (w[12], w[13], offset); + w[20] = amd_bytealign (w[11], w[12], offset); + w[19] = amd_bytealign (w[10], w[11], offset); + w[18] = amd_bytealign (w[ 9], w[10], offset); + w[17] = amd_bytealign (w[ 8], w[ 9], offset); + w[16] = amd_bytealign (w[ 7], w[ 8], offset); + w[15] = amd_bytealign (w[ 6], w[ 7], offset); + w[14] = amd_bytealign (w[ 5], w[ 6], offset); + w[13] = amd_bytealign (w[ 4], w[ 5], offset); + w[12] = amd_bytealign (w[ 3], w[ 4], offset); + w[11] = amd_bytealign (w[ 2], w[ 3], offset); + w[10] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign (w[53], w[54], offset); + w[62] = amd_bytealign (w[52], w[53], offset); + w[61] = amd_bytealign (w[51], w[52], offset); + w[60] = amd_bytealign (w[50], w[51], offset); + w[59] = amd_bytealign (w[49], w[50], offset); + w[58] = amd_bytealign (w[48], w[49], offset); + w[57] = amd_bytealign (w[47], w[48], offset); + w[56] = amd_bytealign (w[46], w[47], offset); + w[55] = amd_bytealign (w[45], w[46], offset); + w[54] = amd_bytealign (w[44], w[45], offset); + w[53] = amd_bytealign (w[43], w[44], offset); + w[52] = amd_bytealign (w[42], w[43], offset); + w[51] = amd_bytealign (w[41], w[42], offset); + w[50] = amd_bytealign (w[40], w[41], offset); + w[49] = amd_bytealign (w[39], w[40], offset); + w[48] = amd_bytealign (w[38], w[39], offset); + w[47] = amd_bytealign (w[37], w[38], offset); + w[46] = amd_bytealign (w[36], w[37], offset); + w[45] = amd_bytealign (w[35], w[36], offset); + w[44] = amd_bytealign (w[34], w[35], offset); + w[43] = 
amd_bytealign (w[33], w[34], offset); + w[42] = amd_bytealign (w[32], w[33], offset); + w[41] = amd_bytealign (w[31], w[32], offset); + w[40] = amd_bytealign (w[30], w[31], offset); + w[39] = amd_bytealign (w[29], w[30], offset); + w[38] = amd_bytealign (w[28], w[29], offset); + w[37] = amd_bytealign (w[27], w[28], offset); + w[36] = amd_bytealign (w[26], w[27], offset); + w[35] = amd_bytealign (w[25], w[26], offset); + w[34] = amd_bytealign (w[24], w[25], offset); + w[33] = amd_bytealign (w[23], w[24], offset); + w[32] = amd_bytealign (w[22], w[23], offset); + w[31] = amd_bytealign (w[21], w[22], offset); + w[30] = amd_bytealign (w[20], w[21], offset); + w[29] = amd_bytealign (w[19], w[20], offset); + w[28] = amd_bytealign (w[18], w[19], offset); + w[27] = amd_bytealign (w[17], w[18], offset); + w[26] = amd_bytealign (w[16], w[17], offset); + w[25] = amd_bytealign (w[15], w[16], offset); + w[24] = amd_bytealign (w[14], w[15], offset); + w[23] = amd_bytealign (w[13], w[14], offset); + w[22] = amd_bytealign (w[12], w[13], offset); + w[21] = amd_bytealign (w[11], w[12], offset); + w[20] = amd_bytealign (w[10], w[11], offset); + w[19] = amd_bytealign (w[ 9], w[10], offset); + w[18] = amd_bytealign (w[ 8], w[ 9], offset); + w[17] = amd_bytealign (w[ 7], w[ 8], offset); + w[16] = amd_bytealign (w[ 6], w[ 7], offset); + w[15] = amd_bytealign (w[ 5], w[ 6], offset); + w[14] = amd_bytealign (w[ 4], w[ 5], offset); + w[13] = amd_bytealign (w[ 3], w[ 4], offset); + w[12] = amd_bytealign (w[ 2], w[ 3], offset); + w[11] = amd_bytealign (w[ 1], w[ 2], offset); + w[10] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign (w[52], w[53], offset); + w[62] = amd_bytealign (w[51], w[52], offset); + w[61] = amd_bytealign (w[50], w[51], offset); + w[60] = amd_bytealign (w[49], w[50], offset); 
+ w[59] = amd_bytealign (w[48], w[49], offset); + w[58] = amd_bytealign (w[47], w[48], offset); + w[57] = amd_bytealign (w[46], w[47], offset); + w[56] = amd_bytealign (w[45], w[46], offset); + w[55] = amd_bytealign (w[44], w[45], offset); + w[54] = amd_bytealign (w[43], w[44], offset); + w[53] = amd_bytealign (w[42], w[43], offset); + w[52] = amd_bytealign (w[41], w[42], offset); + w[51] = amd_bytealign (w[40], w[41], offset); + w[50] = amd_bytealign (w[39], w[40], offset); + w[49] = amd_bytealign (w[38], w[39], offset); + w[48] = amd_bytealign (w[37], w[38], offset); + w[47] = amd_bytealign (w[36], w[37], offset); + w[46] = amd_bytealign (w[35], w[36], offset); + w[45] = amd_bytealign (w[34], w[35], offset); + w[44] = amd_bytealign (w[33], w[34], offset); + w[43] = amd_bytealign (w[32], w[33], offset); + w[42] = amd_bytealign (w[31], w[32], offset); + w[41] = amd_bytealign (w[30], w[31], offset); + w[40] = amd_bytealign (w[29], w[30], offset); + w[39] = amd_bytealign (w[28], w[29], offset); + w[38] = amd_bytealign (w[27], w[28], offset); + w[37] = amd_bytealign (w[26], w[27], offset); + w[36] = amd_bytealign (w[25], w[26], offset); + w[35] = amd_bytealign (w[24], w[25], offset); + w[34] = amd_bytealign (w[23], w[24], offset); + w[33] = amd_bytealign (w[22], w[23], offset); + w[32] = amd_bytealign (w[21], w[22], offset); + w[31] = amd_bytealign (w[20], w[21], offset); + w[30] = amd_bytealign (w[19], w[20], offset); + w[29] = amd_bytealign (w[18], w[19], offset); + w[28] = amd_bytealign (w[17], w[18], offset); + w[27] = amd_bytealign (w[16], w[17], offset); + w[26] = amd_bytealign (w[15], w[16], offset); + w[25] = amd_bytealign (w[14], w[15], offset); + w[24] = amd_bytealign (w[13], w[14], offset); + w[23] = amd_bytealign (w[12], w[13], offset); + w[22] = amd_bytealign (w[11], w[12], offset); + w[21] = amd_bytealign (w[10], w[11], offset); + w[20] = amd_bytealign (w[ 9], w[10], offset); + w[19] = amd_bytealign (w[ 8], w[ 9], offset); + w[18] = amd_bytealign (w[ 7], 
w[ 8], offset); + w[17] = amd_bytealign (w[ 6], w[ 7], offset); + w[16] = amd_bytealign (w[ 5], w[ 6], offset); + w[15] = amd_bytealign (w[ 4], w[ 5], offset); + w[14] = amd_bytealign (w[ 3], w[ 4], offset); + w[13] = amd_bytealign (w[ 2], w[ 3], offset); + w[12] = amd_bytealign (w[ 1], w[ 2], offset); + w[11] = amd_bytealign (w[ 0], w[ 1], offset); + w[10] = amd_bytealign ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign (w[51], w[52], offset); + w[62] = amd_bytealign (w[50], w[51], offset); + w[61] = amd_bytealign (w[49], w[50], offset); + w[60] = amd_bytealign (w[48], w[49], offset); + w[59] = amd_bytealign (w[47], w[48], offset); + w[58] = amd_bytealign (w[46], w[47], offset); + w[57] = amd_bytealign (w[45], w[46], offset); + w[56] = amd_bytealign (w[44], w[45], offset); + w[55] = amd_bytealign (w[43], w[44], offset); + w[54] = amd_bytealign (w[42], w[43], offset); + w[53] = amd_bytealign (w[41], w[42], offset); + w[52] = amd_bytealign (w[40], w[41], offset); + w[51] = amd_bytealign (w[39], w[40], offset); + w[50] = amd_bytealign (w[38], w[39], offset); + w[49] = amd_bytealign (w[37], w[38], offset); + w[48] = amd_bytealign (w[36], w[37], offset); + w[47] = amd_bytealign (w[35], w[36], offset); + w[46] = amd_bytealign (w[34], w[35], offset); + w[45] = amd_bytealign (w[33], w[34], offset); + w[44] = amd_bytealign (w[32], w[33], offset); + w[43] = amd_bytealign (w[31], w[32], offset); + w[42] = amd_bytealign (w[30], w[31], offset); + w[41] = amd_bytealign (w[29], w[30], offset); + w[40] = amd_bytealign (w[28], w[29], offset); + w[39] = amd_bytealign (w[27], w[28], offset); + w[38] = amd_bytealign (w[26], w[27], offset); + w[37] = amd_bytealign (w[25], w[26], offset); + w[36] = amd_bytealign (w[24], w[25], offset); + w[35] = amd_bytealign (w[23], w[24], offset); + w[34] = amd_bytealign (w[22], w[23], offset); + w[33] 
= amd_bytealign (w[21], w[22], offset); + w[32] = amd_bytealign (w[20], w[21], offset); + w[31] = amd_bytealign (w[19], w[20], offset); + w[30] = amd_bytealign (w[18], w[19], offset); + w[29] = amd_bytealign (w[17], w[18], offset); + w[28] = amd_bytealign (w[16], w[17], offset); + w[27] = amd_bytealign (w[15], w[16], offset); + w[26] = amd_bytealign (w[14], w[15], offset); + w[25] = amd_bytealign (w[13], w[14], offset); + w[24] = amd_bytealign (w[12], w[13], offset); + w[23] = amd_bytealign (w[11], w[12], offset); + w[22] = amd_bytealign (w[10], w[11], offset); + w[21] = amd_bytealign (w[ 9], w[10], offset); + w[20] = amd_bytealign (w[ 8], w[ 9], offset); + w[19] = amd_bytealign (w[ 7], w[ 8], offset); + w[18] = amd_bytealign (w[ 6], w[ 7], offset); + w[17] = amd_bytealign (w[ 5], w[ 6], offset); + w[16] = amd_bytealign (w[ 4], w[ 5], offset); + w[15] = amd_bytealign (w[ 3], w[ 4], offset); + w[14] = amd_bytealign (w[ 2], w[ 3], offset); + w[13] = amd_bytealign (w[ 1], w[ 2], offset); + w[12] = amd_bytealign (w[ 0], w[ 1], offset); + w[11] = amd_bytealign ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign (w[50], w[51], offset); + w[62] = amd_bytealign (w[49], w[50], offset); + w[61] = amd_bytealign (w[48], w[49], offset); + w[60] = amd_bytealign (w[47], w[48], offset); + w[59] = amd_bytealign (w[46], w[47], offset); + w[58] = amd_bytealign (w[45], w[46], offset); + w[57] = amd_bytealign (w[44], w[45], offset); + w[56] = amd_bytealign (w[43], w[44], offset); + w[55] = amd_bytealign (w[42], w[43], offset); + w[54] = amd_bytealign (w[41], w[42], offset); + w[53] = amd_bytealign (w[40], w[41], offset); + w[52] = amd_bytealign (w[39], w[40], offset); + w[51] = amd_bytealign (w[38], w[39], offset); + w[50] = amd_bytealign (w[37], w[38], offset); + w[49] = amd_bytealign (w[36], w[37], offset); + w[48] = 
amd_bytealign (w[35], w[36], offset); + w[47] = amd_bytealign (w[34], w[35], offset); + w[46] = amd_bytealign (w[33], w[34], offset); + w[45] = amd_bytealign (w[32], w[33], offset); + w[44] = amd_bytealign (w[31], w[32], offset); + w[43] = amd_bytealign (w[30], w[31], offset); + w[42] = amd_bytealign (w[29], w[30], offset); + w[41] = amd_bytealign (w[28], w[29], offset); + w[40] = amd_bytealign (w[27], w[28], offset); + w[39] = amd_bytealign (w[26], w[27], offset); + w[38] = amd_bytealign (w[25], w[26], offset); + w[37] = amd_bytealign (w[24], w[25], offset); + w[36] = amd_bytealign (w[23], w[24], offset); + w[35] = amd_bytealign (w[22], w[23], offset); + w[34] = amd_bytealign (w[21], w[22], offset); + w[33] = amd_bytealign (w[20], w[21], offset); + w[32] = amd_bytealign (w[19], w[20], offset); + w[31] = amd_bytealign (w[18], w[19], offset); + w[30] = amd_bytealign (w[17], w[18], offset); + w[29] = amd_bytealign (w[16], w[17], offset); + w[28] = amd_bytealign (w[15], w[16], offset); + w[27] = amd_bytealign (w[14], w[15], offset); + w[26] = amd_bytealign (w[13], w[14], offset); + w[25] = amd_bytealign (w[12], w[13], offset); + w[24] = amd_bytealign (w[11], w[12], offset); + w[23] = amd_bytealign (w[10], w[11], offset); + w[22] = amd_bytealign (w[ 9], w[10], offset); + w[21] = amd_bytealign (w[ 8], w[ 9], offset); + w[20] = amd_bytealign (w[ 7], w[ 8], offset); + w[19] = amd_bytealign (w[ 6], w[ 7], offset); + w[18] = amd_bytealign (w[ 5], w[ 6], offset); + w[17] = amd_bytealign (w[ 4], w[ 5], offset); + w[16] = amd_bytealign (w[ 3], w[ 4], offset); + w[15] = amd_bytealign (w[ 2], w[ 3], offset); + w[14] = amd_bytealign (w[ 1], w[ 2], offset); + w[13] = amd_bytealign (w[ 0], w[ 1], offset); + w[12] = amd_bytealign ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign (w[49], w[50], offset); + w[62] 
= amd_bytealign (w[48], w[49], offset); + w[61] = amd_bytealign (w[47], w[48], offset); + w[60] = amd_bytealign (w[46], w[47], offset); + w[59] = amd_bytealign (w[45], w[46], offset); + w[58] = amd_bytealign (w[44], w[45], offset); + w[57] = amd_bytealign (w[43], w[44], offset); + w[56] = amd_bytealign (w[42], w[43], offset); + w[55] = amd_bytealign (w[41], w[42], offset); + w[54] = amd_bytealign (w[40], w[41], offset); + w[53] = amd_bytealign (w[39], w[40], offset); + w[52] = amd_bytealign (w[38], w[39], offset); + w[51] = amd_bytealign (w[37], w[38], offset); + w[50] = amd_bytealign (w[36], w[37], offset); + w[49] = amd_bytealign (w[35], w[36], offset); + w[48] = amd_bytealign (w[34], w[35], offset); + w[47] = amd_bytealign (w[33], w[34], offset); + w[46] = amd_bytealign (w[32], w[33], offset); + w[45] = amd_bytealign (w[31], w[32], offset); + w[44] = amd_bytealign (w[30], w[31], offset); + w[43] = amd_bytealign (w[29], w[30], offset); + w[42] = amd_bytealign (w[28], w[29], offset); + w[41] = amd_bytealign (w[27], w[28], offset); + w[40] = amd_bytealign (w[26], w[27], offset); + w[39] = amd_bytealign (w[25], w[26], offset); + w[38] = amd_bytealign (w[24], w[25], offset); + w[37] = amd_bytealign (w[23], w[24], offset); + w[36] = amd_bytealign (w[22], w[23], offset); + w[35] = amd_bytealign (w[21], w[22], offset); + w[34] = amd_bytealign (w[20], w[21], offset); + w[33] = amd_bytealign (w[19], w[20], offset); + w[32] = amd_bytealign (w[18], w[19], offset); + w[31] = amd_bytealign (w[17], w[18], offset); + w[30] = amd_bytealign (w[16], w[17], offset); + w[29] = amd_bytealign (w[15], w[16], offset); + w[28] = amd_bytealign (w[14], w[15], offset); + w[27] = amd_bytealign (w[13], w[14], offset); + w[26] = amd_bytealign (w[12], w[13], offset); + w[25] = amd_bytealign (w[11], w[12], offset); + w[24] = amd_bytealign (w[10], w[11], offset); + w[23] = amd_bytealign (w[ 9], w[10], offset); + w[22] = amd_bytealign (w[ 8], w[ 9], offset); + w[21] = amd_bytealign (w[ 7], w[ 8], 
offset); + w[20] = amd_bytealign (w[ 6], w[ 7], offset); + w[19] = amd_bytealign (w[ 5], w[ 6], offset); + w[18] = amd_bytealign (w[ 4], w[ 5], offset); + w[17] = amd_bytealign (w[ 3], w[ 4], offset); + w[16] = amd_bytealign (w[ 2], w[ 3], offset); + w[15] = amd_bytealign (w[ 1], w[ 2], offset); + w[14] = amd_bytealign (w[ 0], w[ 1], offset); + w[13] = amd_bytealign ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign (w[48], w[49], offset); + w[62] = amd_bytealign (w[47], w[48], offset); + w[61] = amd_bytealign (w[46], w[47], offset); + w[60] = amd_bytealign (w[45], w[46], offset); + w[59] = amd_bytealign (w[44], w[45], offset); + w[58] = amd_bytealign (w[43], w[44], offset); + w[57] = amd_bytealign (w[42], w[43], offset); + w[56] = amd_bytealign (w[41], w[42], offset); + w[55] = amd_bytealign (w[40], w[41], offset); + w[54] = amd_bytealign (w[39], w[40], offset); + w[53] = amd_bytealign (w[38], w[39], offset); + w[52] = amd_bytealign (w[37], w[38], offset); + w[51] = amd_bytealign (w[36], w[37], offset); + w[50] = amd_bytealign (w[35], w[36], offset); + w[49] = amd_bytealign (w[34], w[35], offset); + w[48] = amd_bytealign (w[33], w[34], offset); + w[47] = amd_bytealign (w[32], w[33], offset); + w[46] = amd_bytealign (w[31], w[32], offset); + w[45] = amd_bytealign (w[30], w[31], offset); + w[44] = amd_bytealign (w[29], w[30], offset); + w[43] = amd_bytealign (w[28], w[29], offset); + w[42] = amd_bytealign (w[27], w[28], offset); + w[41] = amd_bytealign (w[26], w[27], offset); + w[40] = amd_bytealign (w[25], w[26], offset); + w[39] = amd_bytealign (w[24], w[25], offset); + w[38] = amd_bytealign (w[23], w[24], offset); + w[37] = amd_bytealign (w[22], w[23], offset); + w[36] = amd_bytealign (w[21], w[22], offset); + w[35] = amd_bytealign (w[20], w[21], offset); + w[34] = amd_bytealign 
(w[19], w[20], offset); + w[33] = amd_bytealign (w[18], w[19], offset); + w[32] = amd_bytealign (w[17], w[18], offset); + w[31] = amd_bytealign (w[16], w[17], offset); + w[30] = amd_bytealign (w[15], w[16], offset); + w[29] = amd_bytealign (w[14], w[15], offset); + w[28] = amd_bytealign (w[13], w[14], offset); + w[27] = amd_bytealign (w[12], w[13], offset); + w[26] = amd_bytealign (w[11], w[12], offset); + w[25] = amd_bytealign (w[10], w[11], offset); + w[24] = amd_bytealign (w[ 9], w[10], offset); + w[23] = amd_bytealign (w[ 8], w[ 9], offset); + w[22] = amd_bytealign (w[ 7], w[ 8], offset); + w[21] = amd_bytealign (w[ 6], w[ 7], offset); + w[20] = amd_bytealign (w[ 5], w[ 6], offset); + w[19] = amd_bytealign (w[ 4], w[ 5], offset); + w[18] = amd_bytealign (w[ 3], w[ 4], offset); + w[17] = amd_bytealign (w[ 2], w[ 3], offset); + w[16] = amd_bytealign (w[ 1], w[ 2], offset); + w[15] = amd_bytealign (w[ 0], w[ 1], offset); + w[14] = amd_bytealign ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign (w[47], w[48], offset); + w[62] = amd_bytealign (w[46], w[47], offset); + w[61] = amd_bytealign (w[45], w[46], offset); + w[60] = amd_bytealign (w[44], w[45], offset); + w[59] = amd_bytealign (w[43], w[44], offset); + w[58] = amd_bytealign (w[42], w[43], offset); + w[57] = amd_bytealign (w[41], w[42], offset); + w[56] = amd_bytealign (w[40], w[41], offset); + w[55] = amd_bytealign (w[39], w[40], offset); + w[54] = amd_bytealign (w[38], w[39], offset); + w[53] = amd_bytealign (w[37], w[38], offset); + w[52] = amd_bytealign (w[36], w[37], offset); + w[51] = amd_bytealign (w[35], w[36], offset); + w[50] = amd_bytealign (w[34], w[35], offset); + w[49] = amd_bytealign (w[33], w[34], offset); + w[48] = amd_bytealign (w[32], w[33], offset); + w[47] = amd_bytealign (w[31], w[32], 
offset); + w[46] = amd_bytealign (w[30], w[31], offset); + w[45] = amd_bytealign (w[29], w[30], offset); + w[44] = amd_bytealign (w[28], w[29], offset); + w[43] = amd_bytealign (w[27], w[28], offset); + w[42] = amd_bytealign (w[26], w[27], offset); + w[41] = amd_bytealign (w[25], w[26], offset); + w[40] = amd_bytealign (w[24], w[25], offset); + w[39] = amd_bytealign (w[23], w[24], offset); + w[38] = amd_bytealign (w[22], w[23], offset); + w[37] = amd_bytealign (w[21], w[22], offset); + w[36] = amd_bytealign (w[20], w[21], offset); + w[35] = amd_bytealign (w[19], w[20], offset); + w[34] = amd_bytealign (w[18], w[19], offset); + w[33] = amd_bytealign (w[17], w[18], offset); + w[32] = amd_bytealign (w[16], w[17], offset); + w[31] = amd_bytealign (w[15], w[16], offset); + w[30] = amd_bytealign (w[14], w[15], offset); + w[29] = amd_bytealign (w[13], w[14], offset); + w[28] = amd_bytealign (w[12], w[13], offset); + w[27] = amd_bytealign (w[11], w[12], offset); + w[26] = amd_bytealign (w[10], w[11], offset); + w[25] = amd_bytealign (w[ 9], w[10], offset); + w[24] = amd_bytealign (w[ 8], w[ 9], offset); + w[23] = amd_bytealign (w[ 7], w[ 8], offset); + w[22] = amd_bytealign (w[ 6], w[ 7], offset); + w[21] = amd_bytealign (w[ 5], w[ 6], offset); + w[20] = amd_bytealign (w[ 4], w[ 5], offset); + w[19] = amd_bytealign (w[ 3], w[ 4], offset); + w[18] = amd_bytealign (w[ 2], w[ 3], offset); + w[17] = amd_bytealign (w[ 1], w[ 2], offset); + w[16] = amd_bytealign (w[ 0], w[ 1], offset); + w[15] = amd_bytealign ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign (w[46], w[47], offset); + w[62] = amd_bytealign (w[45], w[46], offset); + w[61] = amd_bytealign (w[44], w[45], offset); + w[60] = amd_bytealign (w[43], w[44], offset); + w[59] = amd_bytealign (w[42], w[43], 
offset); + w[58] = amd_bytealign (w[41], w[42], offset); + w[57] = amd_bytealign (w[40], w[41], offset); + w[56] = amd_bytealign (w[39], w[40], offset); + w[55] = amd_bytealign (w[38], w[39], offset); + w[54] = amd_bytealign (w[37], w[38], offset); + w[53] = amd_bytealign (w[36], w[37], offset); + w[52] = amd_bytealign (w[35], w[36], offset); + w[51] = amd_bytealign (w[34], w[35], offset); + w[50] = amd_bytealign (w[33], w[34], offset); + w[49] = amd_bytealign (w[32], w[33], offset); + w[48] = amd_bytealign (w[31], w[32], offset); + w[47] = amd_bytealign (w[30], w[31], offset); + w[46] = amd_bytealign (w[29], w[30], offset); + w[45] = amd_bytealign (w[28], w[29], offset); + w[44] = amd_bytealign (w[27], w[28], offset); + w[43] = amd_bytealign (w[26], w[27], offset); + w[42] = amd_bytealign (w[25], w[26], offset); + w[41] = amd_bytealign (w[24], w[25], offset); + w[40] = amd_bytealign (w[23], w[24], offset); + w[39] = amd_bytealign (w[22], w[23], offset); + w[38] = amd_bytealign (w[21], w[22], offset); + w[37] = amd_bytealign (w[20], w[21], offset); + w[36] = amd_bytealign (w[19], w[20], offset); + w[35] = amd_bytealign (w[18], w[19], offset); + w[34] = amd_bytealign (w[17], w[18], offset); + w[33] = amd_bytealign (w[16], w[17], offset); + w[32] = amd_bytealign (w[15], w[16], offset); + w[31] = amd_bytealign (w[14], w[15], offset); + w[30] = amd_bytealign (w[13], w[14], offset); + w[29] = amd_bytealign (w[12], w[13], offset); + w[28] = amd_bytealign (w[11], w[12], offset); + w[27] = amd_bytealign (w[10], w[11], offset); + w[26] = amd_bytealign (w[ 9], w[10], offset); + w[25] = amd_bytealign (w[ 8], w[ 9], offset); + w[24] = amd_bytealign (w[ 7], w[ 8], offset); + w[23] = amd_bytealign (w[ 6], w[ 7], offset); + w[22] = amd_bytealign (w[ 5], w[ 6], offset); + w[21] = amd_bytealign (w[ 4], w[ 5], offset); + w[20] = amd_bytealign (w[ 3], w[ 4], offset); + w[19] = amd_bytealign (w[ 2], w[ 3], offset); + w[18] = amd_bytealign (w[ 1], w[ 2], offset); + w[17] = 
amd_bytealign (w[ 0], w[ 1], offset); + w[16] = amd_bytealign ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign (w[45], w[46], offset); + w[62] = amd_bytealign (w[44], w[45], offset); + w[61] = amd_bytealign (w[43], w[44], offset); + w[60] = amd_bytealign (w[42], w[43], offset); + w[59] = amd_bytealign (w[41], w[42], offset); + w[58] = amd_bytealign (w[40], w[41], offset); + w[57] = amd_bytealign (w[39], w[40], offset); + w[56] = amd_bytealign (w[38], w[39], offset); + w[55] = amd_bytealign (w[37], w[38], offset); + w[54] = amd_bytealign (w[36], w[37], offset); + w[53] = amd_bytealign (w[35], w[36], offset); + w[52] = amd_bytealign (w[34], w[35], offset); + w[51] = amd_bytealign (w[33], w[34], offset); + w[50] = amd_bytealign (w[32], w[33], offset); + w[49] = amd_bytealign (w[31], w[32], offset); + w[48] = amd_bytealign (w[30], w[31], offset); + w[47] = amd_bytealign (w[29], w[30], offset); + w[46] = amd_bytealign (w[28], w[29], offset); + w[45] = amd_bytealign (w[27], w[28], offset); + w[44] = amd_bytealign (w[26], w[27], offset); + w[43] = amd_bytealign (w[25], w[26], offset); + w[42] = amd_bytealign (w[24], w[25], offset); + w[41] = amd_bytealign (w[23], w[24], offset); + w[40] = amd_bytealign (w[22], w[23], offset); + w[39] = amd_bytealign (w[21], w[22], offset); + w[38] = amd_bytealign (w[20], w[21], offset); + w[37] = amd_bytealign (w[19], w[20], offset); + w[36] = amd_bytealign (w[18], w[19], offset); + w[35] = amd_bytealign (w[17], w[18], offset); + w[34] = amd_bytealign (w[16], w[17], offset); + w[33] = amd_bytealign (w[15], w[16], offset); + w[32] = amd_bytealign (w[14], w[15], offset); + w[31] = amd_bytealign (w[13], w[14], offset); + w[30] = amd_bytealign (w[12], w[13], offset); + w[29] = amd_bytealign (w[11], w[12], offset); + 
w[28] = amd_bytealign (w[10], w[11], offset); + w[27] = amd_bytealign (w[ 9], w[10], offset); + w[26] = amd_bytealign (w[ 8], w[ 9], offset); + w[25] = amd_bytealign (w[ 7], w[ 8], offset); + w[24] = amd_bytealign (w[ 6], w[ 7], offset); + w[23] = amd_bytealign (w[ 5], w[ 6], offset); + w[22] = amd_bytealign (w[ 4], w[ 5], offset); + w[21] = amd_bytealign (w[ 3], w[ 4], offset); + w[20] = amd_bytealign (w[ 2], w[ 3], offset); + w[19] = amd_bytealign (w[ 1], w[ 2], offset); + w[18] = amd_bytealign (w[ 0], w[ 1], offset); + w[17] = amd_bytealign ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign (w[44], w[45], offset); + w[62] = amd_bytealign (w[43], w[44], offset); + w[61] = amd_bytealign (w[42], w[43], offset); + w[60] = amd_bytealign (w[41], w[42], offset); + w[59] = amd_bytealign (w[40], w[41], offset); + w[58] = amd_bytealign (w[39], w[40], offset); + w[57] = amd_bytealign (w[38], w[39], offset); + w[56] = amd_bytealign (w[37], w[38], offset); + w[55] = amd_bytealign (w[36], w[37], offset); + w[54] = amd_bytealign (w[35], w[36], offset); + w[53] = amd_bytealign (w[34], w[35], offset); + w[52] = amd_bytealign (w[33], w[34], offset); + w[51] = amd_bytealign (w[32], w[33], offset); + w[50] = amd_bytealign (w[31], w[32], offset); + w[49] = amd_bytealign (w[30], w[31], offset); + w[48] = amd_bytealign (w[29], w[30], offset); + w[47] = amd_bytealign (w[28], w[29], offset); + w[46] = amd_bytealign (w[27], w[28], offset); + w[45] = amd_bytealign (w[26], w[27], offset); + w[44] = amd_bytealign (w[25], w[26], offset); + w[43] = amd_bytealign (w[24], w[25], offset); + w[42] = amd_bytealign (w[23], w[24], offset); + w[41] = amd_bytealign (w[22], w[23], offset); + w[40] = amd_bytealign (w[21], w[22], offset); + w[39] = amd_bytealign (w[20], 
w[21], offset); + w[38] = amd_bytealign (w[19], w[20], offset); + w[37] = amd_bytealign (w[18], w[19], offset); + w[36] = amd_bytealign (w[17], w[18], offset); + w[35] = amd_bytealign (w[16], w[17], offset); + w[34] = amd_bytealign (w[15], w[16], offset); + w[33] = amd_bytealign (w[14], w[15], offset); + w[32] = amd_bytealign (w[13], w[14], offset); + w[31] = amd_bytealign (w[12], w[13], offset); + w[30] = amd_bytealign (w[11], w[12], offset); + w[29] = amd_bytealign (w[10], w[11], offset); + w[28] = amd_bytealign (w[ 9], w[10], offset); + w[27] = amd_bytealign (w[ 8], w[ 9], offset); + w[26] = amd_bytealign (w[ 7], w[ 8], offset); + w[25] = amd_bytealign (w[ 6], w[ 7], offset); + w[24] = amd_bytealign (w[ 5], w[ 6], offset); + w[23] = amd_bytealign (w[ 4], w[ 5], offset); + w[22] = amd_bytealign (w[ 3], w[ 4], offset); + w[21] = amd_bytealign (w[ 2], w[ 3], offset); + w[20] = amd_bytealign (w[ 1], w[ 2], offset); + w[19] = amd_bytealign (w[ 0], w[ 1], offset); + w[18] = amd_bytealign ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign (w[43], w[44], offset); + w[62] = amd_bytealign (w[42], w[43], offset); + w[61] = amd_bytealign (w[41], w[42], offset); + w[60] = amd_bytealign (w[40], w[41], offset); + w[59] = amd_bytealign (w[39], w[40], offset); + w[58] = amd_bytealign (w[38], w[39], offset); + w[57] = amd_bytealign (w[37], w[38], offset); + w[56] = amd_bytealign (w[36], w[37], offset); + w[55] = amd_bytealign (w[35], w[36], offset); + w[54] = amd_bytealign (w[34], w[35], offset); + w[53] = amd_bytealign (w[33], w[34], offset); + w[52] = amd_bytealign (w[32], w[33], offset); + w[51] = amd_bytealign (w[31], w[32], offset); + w[50] = amd_bytealign (w[30], w[31], offset); + w[49] = amd_bytealign (w[29], w[30], offset); 
+ w[48] = amd_bytealign (w[28], w[29], offset); + w[47] = amd_bytealign (w[27], w[28], offset); + w[46] = amd_bytealign (w[26], w[27], offset); + w[45] = amd_bytealign (w[25], w[26], offset); + w[44] = amd_bytealign (w[24], w[25], offset); + w[43] = amd_bytealign (w[23], w[24], offset); + w[42] = amd_bytealign (w[22], w[23], offset); + w[41] = amd_bytealign (w[21], w[22], offset); + w[40] = amd_bytealign (w[20], w[21], offset); + w[39] = amd_bytealign (w[19], w[20], offset); + w[38] = amd_bytealign (w[18], w[19], offset); + w[37] = amd_bytealign (w[17], w[18], offset); + w[36] = amd_bytealign (w[16], w[17], offset); + w[35] = amd_bytealign (w[15], w[16], offset); + w[34] = amd_bytealign (w[14], w[15], offset); + w[33] = amd_bytealign (w[13], w[14], offset); + w[32] = amd_bytealign (w[12], w[13], offset); + w[31] = amd_bytealign (w[11], w[12], offset); + w[30] = amd_bytealign (w[10], w[11], offset); + w[29] = amd_bytealign (w[ 9], w[10], offset); + w[28] = amd_bytealign (w[ 8], w[ 9], offset); + w[27] = amd_bytealign (w[ 7], w[ 8], offset); + w[26] = amd_bytealign (w[ 6], w[ 7], offset); + w[25] = amd_bytealign (w[ 5], w[ 6], offset); + w[24] = amd_bytealign (w[ 4], w[ 5], offset); + w[23] = amd_bytealign (w[ 3], w[ 4], offset); + w[22] = amd_bytealign (w[ 2], w[ 3], offset); + w[21] = amd_bytealign (w[ 1], w[ 2], offset); + w[20] = amd_bytealign (w[ 0], w[ 1], offset); + w[19] = amd_bytealign ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = amd_bytealign (w[42], w[43], offset); + w[62] = amd_bytealign (w[41], w[42], offset); + w[61] = amd_bytealign (w[40], w[41], offset); + w[60] = amd_bytealign (w[39], w[40], offset); + w[59] = amd_bytealign (w[38], w[39], offset); + w[58] = amd_bytealign (w[37], w[38], offset); + 
w[57] = amd_bytealign (w[36], w[37], offset); + w[56] = amd_bytealign (w[35], w[36], offset); + w[55] = amd_bytealign (w[34], w[35], offset); + w[54] = amd_bytealign (w[33], w[34], offset); + w[53] = amd_bytealign (w[32], w[33], offset); + w[52] = amd_bytealign (w[31], w[32], offset); + w[51] = amd_bytealign (w[30], w[31], offset); + w[50] = amd_bytealign (w[29], w[30], offset); + w[49] = amd_bytealign (w[28], w[29], offset); + w[48] = amd_bytealign (w[27], w[28], offset); + w[47] = amd_bytealign (w[26], w[27], offset); + w[46] = amd_bytealign (w[25], w[26], offset); + w[45] = amd_bytealign (w[24], w[25], offset); + w[44] = amd_bytealign (w[23], w[24], offset); + w[43] = amd_bytealign (w[22], w[23], offset); + w[42] = amd_bytealign (w[21], w[22], offset); + w[41] = amd_bytealign (w[20], w[21], offset); + w[40] = amd_bytealign (w[19], w[20], offset); + w[39] = amd_bytealign (w[18], w[19], offset); + w[38] = amd_bytealign (w[17], w[18], offset); + w[37] = amd_bytealign (w[16], w[17], offset); + w[36] = amd_bytealign (w[15], w[16], offset); + w[35] = amd_bytealign (w[14], w[15], offset); + w[34] = amd_bytealign (w[13], w[14], offset); + w[33] = amd_bytealign (w[12], w[13], offset); + w[32] = amd_bytealign (w[11], w[12], offset); + w[31] = amd_bytealign (w[10], w[11], offset); + w[30] = amd_bytealign (w[ 9], w[10], offset); + w[29] = amd_bytealign (w[ 8], w[ 9], offset); + w[28] = amd_bytealign (w[ 7], w[ 8], offset); + w[27] = amd_bytealign (w[ 6], w[ 7], offset); + w[26] = amd_bytealign (w[ 5], w[ 6], offset); + w[25] = amd_bytealign (w[ 4], w[ 5], offset); + w[24] = amd_bytealign (w[ 3], w[ 4], offset); + w[23] = amd_bytealign (w[ 2], w[ 3], offset); + w[22] = amd_bytealign (w[ 1], w[ 2], offset); + w[21] = amd_bytealign (w[ 0], w[ 1], offset); + w[20] = amd_bytealign ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 
0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign (w[41], w[42], offset); + w[62] = amd_bytealign (w[40], w[41], offset); + w[61] = amd_bytealign (w[39], w[40], offset); + w[60] = amd_bytealign (w[38], w[39], offset); + w[59] = amd_bytealign (w[37], w[38], offset); + w[58] = amd_bytealign (w[36], w[37], offset); + w[57] = amd_bytealign (w[35], w[36], offset); + w[56] = amd_bytealign (w[34], w[35], offset); + w[55] = amd_bytealign (w[33], w[34], offset); + w[54] = amd_bytealign (w[32], w[33], offset); + w[53] = amd_bytealign (w[31], w[32], offset); + w[52] = amd_bytealign (w[30], w[31], offset); + w[51] = amd_bytealign (w[29], w[30], offset); + w[50] = amd_bytealign (w[28], w[29], offset); + w[49] = amd_bytealign (w[27], w[28], offset); + w[48] = amd_bytealign (w[26], w[27], offset); + w[47] = amd_bytealign (w[25], w[26], offset); + w[46] = amd_bytealign (w[24], w[25], offset); + w[45] = amd_bytealign (w[23], w[24], offset); + w[44] = amd_bytealign (w[22], w[23], offset); + w[43] = amd_bytealign (w[21], w[22], offset); + w[42] = amd_bytealign (w[20], w[21], offset); + w[41] = amd_bytealign (w[19], w[20], offset); + w[40] = amd_bytealign (w[18], w[19], offset); + w[39] = amd_bytealign (w[17], w[18], offset); + w[38] = amd_bytealign (w[16], w[17], offset); + w[37] = amd_bytealign (w[15], w[16], offset); + w[36] = amd_bytealign (w[14], w[15], offset); + w[35] = amd_bytealign (w[13], w[14], offset); + w[34] = amd_bytealign (w[12], w[13], offset); + w[33] = amd_bytealign (w[11], w[12], offset); + w[32] = amd_bytealign (w[10], w[11], offset); + w[31] = amd_bytealign (w[ 9], w[10], offset); + w[30] = amd_bytealign (w[ 8], w[ 9], offset); + w[29] = amd_bytealign (w[ 7], w[ 8], offset); + w[28] = amd_bytealign (w[ 6], w[ 7], offset); + w[27] = amd_bytealign (w[ 5], w[ 6], offset); + w[26] = amd_bytealign (w[ 4], w[ 5], offset); + w[25] = amd_bytealign (w[ 3], w[ 4], offset); + w[24] = 
amd_bytealign (w[ 2], w[ 3], offset); + w[23] = amd_bytealign (w[ 1], w[ 2], offset); + w[22] = amd_bytealign (w[ 0], w[ 1], offset); + w[21] = amd_bytealign ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = amd_bytealign (w[40], w[41], offset); + w[62] = amd_bytealign (w[39], w[40], offset); + w[61] = amd_bytealign (w[38], w[39], offset); + w[60] = amd_bytealign (w[37], w[38], offset); + w[59] = amd_bytealign (w[36], w[37], offset); + w[58] = amd_bytealign (w[35], w[36], offset); + w[57] = amd_bytealign (w[34], w[35], offset); + w[56] = amd_bytealign (w[33], w[34], offset); + w[55] = amd_bytealign (w[32], w[33], offset); + w[54] = amd_bytealign (w[31], w[32], offset); + w[53] = amd_bytealign (w[30], w[31], offset); + w[52] = amd_bytealign (w[29], w[30], offset); + w[51] = amd_bytealign (w[28], w[29], offset); + w[50] = amd_bytealign (w[27], w[28], offset); + w[49] = amd_bytealign (w[26], w[27], offset); + w[48] = amd_bytealign (w[25], w[26], offset); + w[47] = amd_bytealign (w[24], w[25], offset); + w[46] = amd_bytealign (w[23], w[24], offset); + w[45] = amd_bytealign (w[22], w[23], offset); + w[44] = amd_bytealign (w[21], w[22], offset); + w[43] = amd_bytealign (w[20], w[21], offset); + w[42] = amd_bytealign (w[19], w[20], offset); + w[41] = amd_bytealign (w[18], w[19], offset); + w[40] = amd_bytealign (w[17], w[18], offset); + w[39] = amd_bytealign (w[16], w[17], offset); + w[38] = amd_bytealign (w[15], w[16], offset); + w[37] = amd_bytealign (w[14], w[15], offset); + w[36] = amd_bytealign (w[13], w[14], offset); + w[35] = amd_bytealign (w[12], w[13], offset); + w[34] = amd_bytealign (w[11], w[12], offset); + w[33] = amd_bytealign (w[10], w[11], offset); + w[32] = amd_bytealign (w[ 9], 
w[10], offset); + w[31] = amd_bytealign (w[ 8], w[ 9], offset); + w[30] = amd_bytealign (w[ 7], w[ 8], offset); + w[29] = amd_bytealign (w[ 6], w[ 7], offset); + w[28] = amd_bytealign (w[ 5], w[ 6], offset); + w[27] = amd_bytealign (w[ 4], w[ 5], offset); + w[26] = amd_bytealign (w[ 3], w[ 4], offset); + w[25] = amd_bytealign (w[ 2], w[ 3], offset); + w[24] = amd_bytealign (w[ 1], w[ 2], offset); + w[23] = amd_bytealign (w[ 0], w[ 1], offset); + w[22] = amd_bytealign ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign (w[39], w[40], offset); + w[62] = amd_bytealign (w[38], w[39], offset); + w[61] = amd_bytealign (w[37], w[38], offset); + w[60] = amd_bytealign (w[36], w[37], offset); + w[59] = amd_bytealign (w[35], w[36], offset); + w[58] = amd_bytealign (w[34], w[35], offset); + w[57] = amd_bytealign (w[33], w[34], offset); + w[56] = amd_bytealign (w[32], w[33], offset); + w[55] = amd_bytealign (w[31], w[32], offset); + w[54] = amd_bytealign (w[30], w[31], offset); + w[53] = amd_bytealign (w[29], w[30], offset); + w[52] = amd_bytealign (w[28], w[29], offset); + w[51] = amd_bytealign (w[27], w[28], offset); + w[50] = amd_bytealign (w[26], w[27], offset); + w[49] = amd_bytealign (w[25], w[26], offset); + w[48] = amd_bytealign (w[24], w[25], offset); + w[47] = amd_bytealign (w[23], w[24], offset); + w[46] = amd_bytealign (w[22], w[23], offset); + w[45] = amd_bytealign (w[21], w[22], offset); + w[44] = amd_bytealign (w[20], w[21], offset); + w[43] = amd_bytealign (w[19], w[20], offset); + w[42] = amd_bytealign (w[18], w[19], offset); + w[41] = amd_bytealign (w[17], w[18], offset); + w[40] = amd_bytealign (w[16], w[17], offset); + w[39] = amd_bytealign (w[15], w[16], 
offset); + w[38] = amd_bytealign (w[14], w[15], offset); + w[37] = amd_bytealign (w[13], w[14], offset); + w[36] = amd_bytealign (w[12], w[13], offset); + w[35] = amd_bytealign (w[11], w[12], offset); + w[34] = amd_bytealign (w[10], w[11], offset); + w[33] = amd_bytealign (w[ 9], w[10], offset); + w[32] = amd_bytealign (w[ 8], w[ 9], offset); + w[31] = amd_bytealign (w[ 7], w[ 8], offset); + w[30] = amd_bytealign (w[ 6], w[ 7], offset); + w[29] = amd_bytealign (w[ 5], w[ 6], offset); + w[28] = amd_bytealign (w[ 4], w[ 5], offset); + w[27] = amd_bytealign (w[ 3], w[ 4], offset); + w[26] = amd_bytealign (w[ 2], w[ 3], offset); + w[25] = amd_bytealign (w[ 1], w[ 2], offset); + w[24] = amd_bytealign (w[ 0], w[ 1], offset); + w[23] = amd_bytealign ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign (w[38], w[39], offset); + w[62] = amd_bytealign (w[37], w[38], offset); + w[61] = amd_bytealign (w[36], w[37], offset); + w[60] = amd_bytealign (w[35], w[36], offset); + w[59] = amd_bytealign (w[34], w[35], offset); + w[58] = amd_bytealign (w[33], w[34], offset); + w[57] = amd_bytealign (w[32], w[33], offset); + w[56] = amd_bytealign (w[31], w[32], offset); + w[55] = amd_bytealign (w[30], w[31], offset); + w[54] = amd_bytealign (w[29], w[30], offset); + w[53] = amd_bytealign (w[28], w[29], offset); + w[52] = amd_bytealign (w[27], w[28], offset); + w[51] = amd_bytealign (w[26], w[27], offset); + w[50] = amd_bytealign (w[25], w[26], offset); + w[49] = amd_bytealign (w[24], w[25], offset); + w[48] = amd_bytealign (w[23], w[24], offset); + w[47] = amd_bytealign (w[22], w[23], offset); + w[46] = amd_bytealign (w[21], w[22], offset); + w[45] = amd_bytealign (w[20], w[21], 
offset); + w[44] = amd_bytealign (w[19], w[20], offset); + w[43] = amd_bytealign (w[18], w[19], offset); + w[42] = amd_bytealign (w[17], w[18], offset); + w[41] = amd_bytealign (w[16], w[17], offset); + w[40] = amd_bytealign (w[15], w[16], offset); + w[39] = amd_bytealign (w[14], w[15], offset); + w[38] = amd_bytealign (w[13], w[14], offset); + w[37] = amd_bytealign (w[12], w[13], offset); + w[36] = amd_bytealign (w[11], w[12], offset); + w[35] = amd_bytealign (w[10], w[11], offset); + w[34] = amd_bytealign (w[ 9], w[10], offset); + w[33] = amd_bytealign (w[ 8], w[ 9], offset); + w[32] = amd_bytealign (w[ 7], w[ 8], offset); + w[31] = amd_bytealign (w[ 6], w[ 7], offset); + w[30] = amd_bytealign (w[ 5], w[ 6], offset); + w[29] = amd_bytealign (w[ 4], w[ 5], offset); + w[28] = amd_bytealign (w[ 3], w[ 4], offset); + w[27] = amd_bytealign (w[ 2], w[ 3], offset); + w[26] = amd_bytealign (w[ 1], w[ 2], offset); + w[25] = amd_bytealign (w[ 0], w[ 1], offset); + w[24] = amd_bytealign ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign (w[37], w[38], offset); + w[62] = amd_bytealign (w[36], w[37], offset); + w[61] = amd_bytealign (w[35], w[36], offset); + w[60] = amd_bytealign (w[34], w[35], offset); + w[59] = amd_bytealign (w[33], w[34], offset); + w[58] = amd_bytealign (w[32], w[33], offset); + w[57] = amd_bytealign (w[31], w[32], offset); + w[56] = amd_bytealign (w[30], w[31], offset); + w[55] = amd_bytealign (w[29], w[30], offset); + w[54] = amd_bytealign (w[28], w[29], offset); + w[53] = amd_bytealign (w[27], w[28], offset); + w[52] = amd_bytealign (w[26], w[27], offset); + w[51] = amd_bytealign (w[25], w[26], offset); + w[50] = amd_bytealign 
(w[24], w[25], offset); + w[49] = amd_bytealign (w[23], w[24], offset); + w[48] = amd_bytealign (w[22], w[23], offset); + w[47] = amd_bytealign (w[21], w[22], offset); + w[46] = amd_bytealign (w[20], w[21], offset); + w[45] = amd_bytealign (w[19], w[20], offset); + w[44] = amd_bytealign (w[18], w[19], offset); + w[43] = amd_bytealign (w[17], w[18], offset); + w[42] = amd_bytealign (w[16], w[17], offset); + w[41] = amd_bytealign (w[15], w[16], offset); + w[40] = amd_bytealign (w[14], w[15], offset); + w[39] = amd_bytealign (w[13], w[14], offset); + w[38] = amd_bytealign (w[12], w[13], offset); + w[37] = amd_bytealign (w[11], w[12], offset); + w[36] = amd_bytealign (w[10], w[11], offset); + w[35] = amd_bytealign (w[ 9], w[10], offset); + w[34] = amd_bytealign (w[ 8], w[ 9], offset); + w[33] = amd_bytealign (w[ 7], w[ 8], offset); + w[32] = amd_bytealign (w[ 6], w[ 7], offset); + w[31] = amd_bytealign (w[ 5], w[ 6], offset); + w[30] = amd_bytealign (w[ 4], w[ 5], offset); + w[29] = amd_bytealign (w[ 3], w[ 4], offset); + w[28] = amd_bytealign (w[ 2], w[ 3], offset); + w[27] = amd_bytealign (w[ 1], w[ 2], offset); + w[26] = amd_bytealign (w[ 0], w[ 1], offset); + w[25] = amd_bytealign ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign (w[36], w[37], offset); + w[62] = amd_bytealign (w[35], w[36], offset); + w[61] = amd_bytealign (w[34], w[35], offset); + w[60] = amd_bytealign (w[33], w[34], offset); + w[59] = amd_bytealign (w[32], w[33], offset); + w[58] = amd_bytealign (w[31], w[32], offset); + w[57] = amd_bytealign (w[30], w[31], offset); + w[56] = amd_bytealign (w[29], w[30], offset); + w[55] = amd_bytealign (w[28], w[29], 
offset); + w[54] = amd_bytealign (w[27], w[28], offset); + w[53] = amd_bytealign (w[26], w[27], offset); + w[52] = amd_bytealign (w[25], w[26], offset); + w[51] = amd_bytealign (w[24], w[25], offset); + w[50] = amd_bytealign (w[23], w[24], offset); + w[49] = amd_bytealign (w[22], w[23], offset); + w[48] = amd_bytealign (w[21], w[22], offset); + w[47] = amd_bytealign (w[20], w[21], offset); + w[46] = amd_bytealign (w[19], w[20], offset); + w[45] = amd_bytealign (w[18], w[19], offset); + w[44] = amd_bytealign (w[17], w[18], offset); + w[43] = amd_bytealign (w[16], w[17], offset); + w[42] = amd_bytealign (w[15], w[16], offset); + w[41] = amd_bytealign (w[14], w[15], offset); + w[40] = amd_bytealign (w[13], w[14], offset); + w[39] = amd_bytealign (w[12], w[13], offset); + w[38] = amd_bytealign (w[11], w[12], offset); + w[37] = amd_bytealign (w[10], w[11], offset); + w[36] = amd_bytealign (w[ 9], w[10], offset); + w[35] = amd_bytealign (w[ 8], w[ 9], offset); + w[34] = amd_bytealign (w[ 7], w[ 8], offset); + w[33] = amd_bytealign (w[ 6], w[ 7], offset); + w[32] = amd_bytealign (w[ 5], w[ 6], offset); + w[31] = amd_bytealign (w[ 4], w[ 5], offset); + w[30] = amd_bytealign (w[ 3], w[ 4], offset); + w[29] = amd_bytealign (w[ 2], w[ 3], offset); + w[28] = amd_bytealign (w[ 1], w[ 2], offset); + w[27] = amd_bytealign (w[ 0], w[ 1], offset); + w[26] = amd_bytealign ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign (w[35], w[36], offset); + w[62] = amd_bytealign (w[34], w[35], offset); + w[61] = amd_bytealign (w[33], w[34], offset); + w[60] = amd_bytealign (w[32], w[33], offset); + w[59] = amd_bytealign (w[31], w[32], offset); 
+ w[58] = amd_bytealign (w[30], w[31], offset); + w[57] = amd_bytealign (w[29], w[30], offset); + w[56] = amd_bytealign (w[28], w[29], offset); + w[55] = amd_bytealign (w[27], w[28], offset); + w[54] = amd_bytealign (w[26], w[27], offset); + w[53] = amd_bytealign (w[25], w[26], offset); + w[52] = amd_bytealign (w[24], w[25], offset); + w[51] = amd_bytealign (w[23], w[24], offset); + w[50] = amd_bytealign (w[22], w[23], offset); + w[49] = amd_bytealign (w[21], w[22], offset); + w[48] = amd_bytealign (w[20], w[21], offset); + w[47] = amd_bytealign (w[19], w[20], offset); + w[46] = amd_bytealign (w[18], w[19], offset); + w[45] = amd_bytealign (w[17], w[18], offset); + w[44] = amd_bytealign (w[16], w[17], offset); + w[43] = amd_bytealign (w[15], w[16], offset); + w[42] = amd_bytealign (w[14], w[15], offset); + w[41] = amd_bytealign (w[13], w[14], offset); + w[40] = amd_bytealign (w[12], w[13], offset); + w[39] = amd_bytealign (w[11], w[12], offset); + w[38] = amd_bytealign (w[10], w[11], offset); + w[37] = amd_bytealign (w[ 9], w[10], offset); + w[36] = amd_bytealign (w[ 8], w[ 9], offset); + w[35] = amd_bytealign (w[ 7], w[ 8], offset); + w[34] = amd_bytealign (w[ 6], w[ 7], offset); + w[33] = amd_bytealign (w[ 5], w[ 6], offset); + w[32] = amd_bytealign (w[ 4], w[ 5], offset); + w[31] = amd_bytealign (w[ 3], w[ 4], offset); + w[30] = amd_bytealign (w[ 2], w[ 3], offset); + w[29] = amd_bytealign (w[ 1], w[ 2], offset); + w[28] = amd_bytealign (w[ 0], w[ 1], offset); + w[27] = amd_bytealign ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign (w[34], w[35], offset); + w[62] = amd_bytealign (w[33], w[34], 
offset); + w[61] = amd_bytealign (w[32], w[33], offset); + w[60] = amd_bytealign (w[31], w[32], offset); + w[59] = amd_bytealign (w[30], w[31], offset); + w[58] = amd_bytealign (w[29], w[30], offset); + w[57] = amd_bytealign (w[28], w[29], offset); + w[56] = amd_bytealign (w[27], w[28], offset); + w[55] = amd_bytealign (w[26], w[27], offset); + w[54] = amd_bytealign (w[25], w[26], offset); + w[53] = amd_bytealign (w[24], w[25], offset); + w[52] = amd_bytealign (w[23], w[24], offset); + w[51] = amd_bytealign (w[22], w[23], offset); + w[50] = amd_bytealign (w[21], w[22], offset); + w[49] = amd_bytealign (w[20], w[21], offset); + w[48] = amd_bytealign (w[19], w[20], offset); + w[47] = amd_bytealign (w[18], w[19], offset); + w[46] = amd_bytealign (w[17], w[18], offset); + w[45] = amd_bytealign (w[16], w[17], offset); + w[44] = amd_bytealign (w[15], w[16], offset); + w[43] = amd_bytealign (w[14], w[15], offset); + w[42] = amd_bytealign (w[13], w[14], offset); + w[41] = amd_bytealign (w[12], w[13], offset); + w[40] = amd_bytealign (w[11], w[12], offset); + w[39] = amd_bytealign (w[10], w[11], offset); + w[38] = amd_bytealign (w[ 9], w[10], offset); + w[37] = amd_bytealign (w[ 8], w[ 9], offset); + w[36] = amd_bytealign (w[ 7], w[ 8], offset); + w[35] = amd_bytealign (w[ 6], w[ 7], offset); + w[34] = amd_bytealign (w[ 5], w[ 6], offset); + w[33] = amd_bytealign (w[ 4], w[ 5], offset); + w[32] = amd_bytealign (w[ 3], w[ 4], offset); + w[31] = amd_bytealign (w[ 2], w[ 3], offset); + w[30] = amd_bytealign (w[ 1], w[ 2], offset); + w[29] = amd_bytealign (w[ 0], w[ 1], offset); + w[28] = amd_bytealign ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 
0; + + break; + + case 29: + w[63] = amd_bytealign (w[33], w[34], offset); + w[62] = amd_bytealign (w[32], w[33], offset); + w[61] = amd_bytealign (w[31], w[32], offset); + w[60] = amd_bytealign (w[30], w[31], offset); + w[59] = amd_bytealign (w[29], w[30], offset); + w[58] = amd_bytealign (w[28], w[29], offset); + w[57] = amd_bytealign (w[27], w[28], offset); + w[56] = amd_bytealign (w[26], w[27], offset); + w[55] = amd_bytealign (w[25], w[26], offset); + w[54] = amd_bytealign (w[24], w[25], offset); + w[53] = amd_bytealign (w[23], w[24], offset); + w[52] = amd_bytealign (w[22], w[23], offset); + w[51] = amd_bytealign (w[21], w[22], offset); + w[50] = amd_bytealign (w[20], w[21], offset); + w[49] = amd_bytealign (w[19], w[20], offset); + w[48] = amd_bytealign (w[18], w[19], offset); + w[47] = amd_bytealign (w[17], w[18], offset); + w[46] = amd_bytealign (w[16], w[17], offset); + w[45] = amd_bytealign (w[15], w[16], offset); + w[44] = amd_bytealign (w[14], w[15], offset); + w[43] = amd_bytealign (w[13], w[14], offset); + w[42] = amd_bytealign (w[12], w[13], offset); + w[41] = amd_bytealign (w[11], w[12], offset); + w[40] = amd_bytealign (w[10], w[11], offset); + w[39] = amd_bytealign (w[ 9], w[10], offset); + w[38] = amd_bytealign (w[ 8], w[ 9], offset); + w[37] = amd_bytealign (w[ 7], w[ 8], offset); + w[36] = amd_bytealign (w[ 6], w[ 7], offset); + w[35] = amd_bytealign (w[ 5], w[ 6], offset); + w[34] = amd_bytealign (w[ 4], w[ 5], offset); + w[33] = amd_bytealign (w[ 3], w[ 4], offset); + w[32] = amd_bytealign (w[ 2], w[ 3], offset); + w[31] = amd_bytealign (w[ 1], w[ 2], offset); + w[30] = amd_bytealign (w[ 0], w[ 1], offset); + w[29] = amd_bytealign ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 
0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign (w[32], w[33], offset); + w[62] = amd_bytealign (w[31], w[32], offset); + w[61] = amd_bytealign (w[30], w[31], offset); + w[60] = amd_bytealign (w[29], w[30], offset); + w[59] = amd_bytealign (w[28], w[29], offset); + w[58] = amd_bytealign (w[27], w[28], offset); + w[57] = amd_bytealign (w[26], w[27], offset); + w[56] = amd_bytealign (w[25], w[26], offset); + w[55] = amd_bytealign (w[24], w[25], offset); + w[54] = amd_bytealign (w[23], w[24], offset); + w[53] = amd_bytealign (w[22], w[23], offset); + w[52] = amd_bytealign (w[21], w[22], offset); + w[51] = amd_bytealign (w[20], w[21], offset); + w[50] = amd_bytealign (w[19], w[20], offset); + w[49] = amd_bytealign (w[18], w[19], offset); + w[48] = amd_bytealign (w[17], w[18], offset); + w[47] = amd_bytealign (w[16], w[17], offset); + w[46] = amd_bytealign (w[15], w[16], offset); + w[45] = amd_bytealign (w[14], w[15], offset); + w[44] = amd_bytealign (w[13], w[14], offset); + w[43] = amd_bytealign (w[12], w[13], offset); + w[42] = amd_bytealign (w[11], w[12], offset); + w[41] = amd_bytealign (w[10], w[11], offset); + w[40] = amd_bytealign (w[ 9], w[10], offset); + w[39] = amd_bytealign (w[ 8], w[ 9], offset); + w[38] = amd_bytealign (w[ 7], w[ 8], offset); + w[37] = amd_bytealign (w[ 6], w[ 7], offset); + w[36] = amd_bytealign (w[ 5], w[ 6], offset); + w[35] = amd_bytealign (w[ 4], w[ 5], offset); + w[34] = amd_bytealign (w[ 3], w[ 4], offset); + w[33] = amd_bytealign (w[ 2], w[ 3], offset); + w[32] = amd_bytealign (w[ 1], w[ 2], offset); + w[31] = amd_bytealign (w[ 0], w[ 1], offset); + w[30] = amd_bytealign ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 
9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign (w[31], w[32], offset); + w[62] = amd_bytealign (w[30], w[31], offset); + w[61] = amd_bytealign (w[29], w[30], offset); + w[60] = amd_bytealign (w[28], w[29], offset); + w[59] = amd_bytealign (w[27], w[28], offset); + w[58] = amd_bytealign (w[26], w[27], offset); + w[57] = amd_bytealign (w[25], w[26], offset); + w[56] = amd_bytealign (w[24], w[25], offset); + w[55] = amd_bytealign (w[23], w[24], offset); + w[54] = amd_bytealign (w[22], w[23], offset); + w[53] = amd_bytealign (w[21], w[22], offset); + w[52] = amd_bytealign (w[20], w[21], offset); + w[51] = amd_bytealign (w[19], w[20], offset); + w[50] = amd_bytealign (w[18], w[19], offset); + w[49] = amd_bytealign (w[17], w[18], offset); + w[48] = amd_bytealign (w[16], w[17], offset); + w[47] = amd_bytealign (w[15], w[16], offset); + w[46] = amd_bytealign (w[14], w[15], offset); + w[45] = amd_bytealign (w[13], w[14], offset); + w[44] = amd_bytealign (w[12], w[13], offset); + w[43] = amd_bytealign (w[11], w[12], offset); + w[42] = amd_bytealign (w[10], w[11], offset); + w[41] = amd_bytealign (w[ 9], w[10], offset); + w[40] = amd_bytealign (w[ 8], w[ 9], offset); + w[39] = amd_bytealign (w[ 7], w[ 8], offset); + w[38] = amd_bytealign (w[ 6], w[ 7], offset); + w[37] = amd_bytealign (w[ 5], w[ 6], offset); + w[36] = amd_bytealign (w[ 4], w[ 5], offset); + w[35] = amd_bytealign (w[ 3], w[ 4], offset); + w[34] = amd_bytealign (w[ 2], w[ 3], offset); + w[33] = amd_bytealign (w[ 1], w[ 2], offset); + w[32] = amd_bytealign (w[ 0], w[ 1], offset); + w[31] = amd_bytealign ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 
0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign (w[30], w[31], offset); + w[62] = amd_bytealign (w[29], w[30], offset); + w[61] = amd_bytealign (w[28], w[29], offset); + w[60] = amd_bytealign (w[27], w[28], offset); + w[59] = amd_bytealign (w[26], w[27], offset); + w[58] = amd_bytealign (w[25], w[26], offset); + w[57] = amd_bytealign (w[24], w[25], offset); + w[56] = amd_bytealign (w[23], w[24], offset); + w[55] = amd_bytealign (w[22], w[23], offset); + w[54] = amd_bytealign (w[21], w[22], offset); + w[53] = amd_bytealign (w[20], w[21], offset); + w[52] = amd_bytealign (w[19], w[20], offset); + w[51] = amd_bytealign (w[18], w[19], offset); + w[50] = amd_bytealign (w[17], w[18], offset); + w[49] = amd_bytealign (w[16], w[17], offset); + w[48] = amd_bytealign (w[15], w[16], offset); + w[47] = amd_bytealign (w[14], w[15], offset); + w[46] = amd_bytealign (w[13], w[14], offset); + w[45] = amd_bytealign (w[12], w[13], offset); + w[44] = amd_bytealign (w[11], w[12], offset); + w[43] = amd_bytealign (w[10], w[11], offset); + w[42] = amd_bytealign (w[ 9], w[10], offset); + w[41] = amd_bytealign (w[ 8], w[ 9], offset); + w[40] = amd_bytealign (w[ 7], w[ 8], offset); + w[39] = amd_bytealign (w[ 6], w[ 7], offset); + w[38] = amd_bytealign (w[ 5], w[ 6], offset); + w[37] = amd_bytealign (w[ 4], w[ 5], offset); + w[36] = amd_bytealign (w[ 3], w[ 4], offset); + w[35] = amd_bytealign (w[ 2], w[ 3], offset); + w[34] = amd_bytealign (w[ 1], w[ 2], offset); + w[33] = amd_bytealign (w[ 0], w[ 1], offset); + w[32] = amd_bytealign ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 
0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = amd_bytealign (w[29], w[30], offset); + w[62] = amd_bytealign (w[28], w[29], offset); + w[61] = amd_bytealign (w[27], w[28], offset); + w[60] = amd_bytealign (w[26], w[27], offset); + w[59] = amd_bytealign (w[25], w[26], offset); + w[58] = amd_bytealign (w[24], w[25], offset); + w[57] = amd_bytealign (w[23], w[24], offset); + w[56] = amd_bytealign (w[22], w[23], offset); + w[55] = amd_bytealign (w[21], w[22], offset); + w[54] = amd_bytealign (w[20], w[21], offset); + w[53] = amd_bytealign (w[19], w[20], offset); + w[52] = amd_bytealign (w[18], w[19], offset); + w[51] = amd_bytealign (w[17], w[18], offset); + w[50] = amd_bytealign (w[16], w[17], offset); + w[49] = amd_bytealign (w[15], w[16], offset); + w[48] = amd_bytealign (w[14], w[15], offset); + w[47] = amd_bytealign (w[13], w[14], offset); + w[46] = amd_bytealign (w[12], w[13], offset); + w[45] = amd_bytealign (w[11], w[12], offset); + w[44] = amd_bytealign (w[10], w[11], offset); + w[43] = amd_bytealign (w[ 9], w[10], offset); + w[42] = amd_bytealign (w[ 8], w[ 9], offset); + w[41] = amd_bytealign (w[ 7], w[ 8], offset); + w[40] = amd_bytealign (w[ 6], w[ 7], offset); + w[39] = amd_bytealign (w[ 5], w[ 6], offset); + w[38] = amd_bytealign (w[ 4], w[ 5], offset); + w[37] = amd_bytealign (w[ 3], w[ 4], offset); + w[36] = amd_bytealign (w[ 2], w[ 3], offset); + w[35] = amd_bytealign (w[ 1], w[ 2], offset); + w[34] = amd_bytealign (w[ 0], w[ 1], offset); + w[33] = amd_bytealign ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 
3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = amd_bytealign (w[28], w[29], offset); + w[62] = amd_bytealign (w[27], w[28], offset); + w[61] = amd_bytealign (w[26], w[27], offset); + w[60] = amd_bytealign (w[25], w[26], offset); + w[59] = amd_bytealign (w[24], w[25], offset); + w[58] = amd_bytealign (w[23], w[24], offset); + w[57] = amd_bytealign (w[22], w[23], offset); + w[56] = amd_bytealign (w[21], w[22], offset); + w[55] = amd_bytealign (w[20], w[21], offset); + w[54] = amd_bytealign (w[19], w[20], offset); + w[53] = amd_bytealign (w[18], w[19], offset); + w[52] = amd_bytealign (w[17], w[18], offset); + w[51] = amd_bytealign (w[16], w[17], offset); + w[50] = amd_bytealign (w[15], w[16], offset); + w[49] = amd_bytealign (w[14], w[15], offset); + w[48] = amd_bytealign (w[13], w[14], offset); + w[47] = amd_bytealign (w[12], w[13], offset); + w[46] = amd_bytealign (w[11], w[12], offset); + w[45] = amd_bytealign (w[10], w[11], offset); + w[44] = amd_bytealign (w[ 9], w[10], offset); + w[43] = amd_bytealign (w[ 8], w[ 9], offset); + w[42] = amd_bytealign (w[ 7], w[ 8], offset); + w[41] = amd_bytealign (w[ 6], w[ 7], offset); + w[40] = amd_bytealign (w[ 5], w[ 6], offset); + w[39] = amd_bytealign (w[ 4], w[ 5], offset); + w[38] = amd_bytealign (w[ 3], w[ 4], offset); + w[37] = amd_bytealign (w[ 2], w[ 3], offset); + w[36] = amd_bytealign (w[ 1], w[ 2], offset); + w[35] = amd_bytealign (w[ 0], w[ 1], offset); + w[34] = amd_bytealign ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign 
(w[27], w[28], offset); + w[62] = amd_bytealign (w[26], w[27], offset); + w[61] = amd_bytealign (w[25], w[26], offset); + w[60] = amd_bytealign (w[24], w[25], offset); + w[59] = amd_bytealign (w[23], w[24], offset); + w[58] = amd_bytealign (w[22], w[23], offset); + w[57] = amd_bytealign (w[21], w[22], offset); + w[56] = amd_bytealign (w[20], w[21], offset); + w[55] = amd_bytealign (w[19], w[20], offset); + w[54] = amd_bytealign (w[18], w[19], offset); + w[53] = amd_bytealign (w[17], w[18], offset); + w[52] = amd_bytealign (w[16], w[17], offset); + w[51] = amd_bytealign (w[15], w[16], offset); + w[50] = amd_bytealign (w[14], w[15], offset); + w[49] = amd_bytealign (w[13], w[14], offset); + w[48] = amd_bytealign (w[12], w[13], offset); + w[47] = amd_bytealign (w[11], w[12], offset); + w[46] = amd_bytealign (w[10], w[11], offset); + w[45] = amd_bytealign (w[ 9], w[10], offset); + w[44] = amd_bytealign (w[ 8], w[ 9], offset); + w[43] = amd_bytealign (w[ 7], w[ 8], offset); + w[42] = amd_bytealign (w[ 6], w[ 7], offset); + w[41] = amd_bytealign (w[ 5], w[ 6], offset); + w[40] = amd_bytealign (w[ 4], w[ 5], offset); + w[39] = amd_bytealign (w[ 3], w[ 4], offset); + w[38] = amd_bytealign (w[ 2], w[ 3], offset); + w[37] = amd_bytealign (w[ 1], w[ 2], offset); + w[36] = amd_bytealign (w[ 0], w[ 1], offset); + w[35] = amd_bytealign ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign (w[26], w[27], offset); + w[62] = amd_bytealign (w[25], w[26], offset); + w[61] = amd_bytealign (w[24], w[25], offset); + w[60] = 
amd_bytealign (w[23], w[24], offset); + w[59] = amd_bytealign (w[22], w[23], offset); + w[58] = amd_bytealign (w[21], w[22], offset); + w[57] = amd_bytealign (w[20], w[21], offset); + w[56] = amd_bytealign (w[19], w[20], offset); + w[55] = amd_bytealign (w[18], w[19], offset); + w[54] = amd_bytealign (w[17], w[18], offset); + w[53] = amd_bytealign (w[16], w[17], offset); + w[52] = amd_bytealign (w[15], w[16], offset); + w[51] = amd_bytealign (w[14], w[15], offset); + w[50] = amd_bytealign (w[13], w[14], offset); + w[49] = amd_bytealign (w[12], w[13], offset); + w[48] = amd_bytealign (w[11], w[12], offset); + w[47] = amd_bytealign (w[10], w[11], offset); + w[46] = amd_bytealign (w[ 9], w[10], offset); + w[45] = amd_bytealign (w[ 8], w[ 9], offset); + w[44] = amd_bytealign (w[ 7], w[ 8], offset); + w[43] = amd_bytealign (w[ 6], w[ 7], offset); + w[42] = amd_bytealign (w[ 5], w[ 6], offset); + w[41] = amd_bytealign (w[ 4], w[ 5], offset); + w[40] = amd_bytealign (w[ 3], w[ 4], offset); + w[39] = amd_bytealign (w[ 2], w[ 3], offset); + w[38] = amd_bytealign (w[ 1], w[ 2], offset); + w[37] = amd_bytealign (w[ 0], w[ 1], offset); + w[36] = amd_bytealign ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign (w[25], w[26], offset); + w[62] = amd_bytealign (w[24], w[25], offset); + w[61] = amd_bytealign (w[23], w[24], offset); + w[60] = amd_bytealign (w[22], w[23], offset); + w[59] = amd_bytealign (w[21], w[22], offset); + w[58] = amd_bytealign (w[20], w[21], offset); + w[57] = amd_bytealign (w[19], 
w[20], offset); + w[56] = amd_bytealign (w[18], w[19], offset); + w[55] = amd_bytealign (w[17], w[18], offset); + w[54] = amd_bytealign (w[16], w[17], offset); + w[53] = amd_bytealign (w[15], w[16], offset); + w[52] = amd_bytealign (w[14], w[15], offset); + w[51] = amd_bytealign (w[13], w[14], offset); + w[50] = amd_bytealign (w[12], w[13], offset); + w[49] = amd_bytealign (w[11], w[12], offset); + w[48] = amd_bytealign (w[10], w[11], offset); + w[47] = amd_bytealign (w[ 9], w[10], offset); + w[46] = amd_bytealign (w[ 8], w[ 9], offset); + w[45] = amd_bytealign (w[ 7], w[ 8], offset); + w[44] = amd_bytealign (w[ 6], w[ 7], offset); + w[43] = amd_bytealign (w[ 5], w[ 6], offset); + w[42] = amd_bytealign (w[ 4], w[ 5], offset); + w[41] = amd_bytealign (w[ 3], w[ 4], offset); + w[40] = amd_bytealign (w[ 2], w[ 3], offset); + w[39] = amd_bytealign (w[ 1], w[ 2], offset); + w[38] = amd_bytealign (w[ 0], w[ 1], offset); + w[37] = amd_bytealign ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign (w[24], w[25], offset); + w[62] = amd_bytealign (w[23], w[24], offset); + w[61] = amd_bytealign (w[22], w[23], offset); + w[60] = amd_bytealign (w[21], w[22], offset); + w[59] = amd_bytealign (w[20], w[21], offset); + w[58] = amd_bytealign (w[19], w[20], offset); + w[57] = amd_bytealign (w[18], w[19], offset); + w[56] = amd_bytealign (w[17], w[18], offset); + w[55] = amd_bytealign (w[16], w[17], offset); + w[54] = amd_bytealign (w[15], w[16], offset); + w[53] = amd_bytealign (w[14], w[15], 
offset); + w[52] = amd_bytealign (w[13], w[14], offset); + w[51] = amd_bytealign (w[12], w[13], offset); + w[50] = amd_bytealign (w[11], w[12], offset); + w[49] = amd_bytealign (w[10], w[11], offset); + w[48] = amd_bytealign (w[ 9], w[10], offset); + w[47] = amd_bytealign (w[ 8], w[ 9], offset); + w[46] = amd_bytealign (w[ 7], w[ 8], offset); + w[45] = amd_bytealign (w[ 6], w[ 7], offset); + w[44] = amd_bytealign (w[ 5], w[ 6], offset); + w[43] = amd_bytealign (w[ 4], w[ 5], offset); + w[42] = amd_bytealign (w[ 3], w[ 4], offset); + w[41] = amd_bytealign (w[ 2], w[ 3], offset); + w[40] = amd_bytealign (w[ 1], w[ 2], offset); + w[39] = amd_bytealign (w[ 0], w[ 1], offset); + w[38] = amd_bytealign ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign (w[23], w[24], offset); + w[62] = amd_bytealign (w[22], w[23], offset); + w[61] = amd_bytealign (w[21], w[22], offset); + w[60] = amd_bytealign (w[20], w[21], offset); + w[59] = amd_bytealign (w[19], w[20], offset); + w[58] = amd_bytealign (w[18], w[19], offset); + w[57] = amd_bytealign (w[17], w[18], offset); + w[56] = amd_bytealign (w[16], w[17], offset); + w[55] = amd_bytealign (w[15], w[16], offset); + w[54] = amd_bytealign (w[14], w[15], offset); + w[53] = amd_bytealign (w[13], w[14], offset); + w[52] = amd_bytealign (w[12], w[13], offset); + w[51] = amd_bytealign (w[11], w[12], offset); + w[50] = amd_bytealign (w[10], w[11], offset); + w[49] = amd_bytealign (w[ 9], w[10], offset); + w[48] = amd_bytealign (w[ 8], w[ 
9], offset); + w[47] = amd_bytealign (w[ 7], w[ 8], offset); + w[46] = amd_bytealign (w[ 6], w[ 7], offset); + w[45] = amd_bytealign (w[ 5], w[ 6], offset); + w[44] = amd_bytealign (w[ 4], w[ 5], offset); + w[43] = amd_bytealign (w[ 3], w[ 4], offset); + w[42] = amd_bytealign (w[ 2], w[ 3], offset); + w[41] = amd_bytealign (w[ 1], w[ 2], offset); + w[40] = amd_bytealign (w[ 0], w[ 1], offset); + w[39] = amd_bytealign ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign (w[22], w[23], offset); + w[62] = amd_bytealign (w[21], w[22], offset); + w[61] = amd_bytealign (w[20], w[21], offset); + w[60] = amd_bytealign (w[19], w[20], offset); + w[59] = amd_bytealign (w[18], w[19], offset); + w[58] = amd_bytealign (w[17], w[18], offset); + w[57] = amd_bytealign (w[16], w[17], offset); + w[56] = amd_bytealign (w[15], w[16], offset); + w[55] = amd_bytealign (w[14], w[15], offset); + w[54] = amd_bytealign (w[13], w[14], offset); + w[53] = amd_bytealign (w[12], w[13], offset); + w[52] = amd_bytealign (w[11], w[12], offset); + w[51] = amd_bytealign (w[10], w[11], offset); + w[50] = amd_bytealign (w[ 9], w[10], offset); + w[49] = amd_bytealign (w[ 8], w[ 9], offset); + w[48] = amd_bytealign (w[ 7], w[ 8], offset); + w[47] = amd_bytealign (w[ 6], w[ 7], offset); + w[46] = amd_bytealign (w[ 5], w[ 6], offset); + w[45] = amd_bytealign (w[ 4], w[ 5], offset); + w[44] = amd_bytealign (w[ 3], w[ 4], offset); + w[43] = amd_bytealign (w[ 2], w[ 3], offset); + w[42] = 
amd_bytealign (w[ 1], w[ 2], offset); + w[41] = amd_bytealign (w[ 0], w[ 1], offset); + w[40] = amd_bytealign ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign (w[21], w[22], offset); + w[62] = amd_bytealign (w[20], w[21], offset); + w[61] = amd_bytealign (w[19], w[20], offset); + w[60] = amd_bytealign (w[18], w[19], offset); + w[59] = amd_bytealign (w[17], w[18], offset); + w[58] = amd_bytealign (w[16], w[17], offset); + w[57] = amd_bytealign (w[15], w[16], offset); + w[56] = amd_bytealign (w[14], w[15], offset); + w[55] = amd_bytealign (w[13], w[14], offset); + w[54] = amd_bytealign (w[12], w[13], offset); + w[53] = amd_bytealign (w[11], w[12], offset); + w[52] = amd_bytealign (w[10], w[11], offset); + w[51] = amd_bytealign (w[ 9], w[10], offset); + w[50] = amd_bytealign (w[ 8], w[ 9], offset); + w[49] = amd_bytealign (w[ 7], w[ 8], offset); + w[48] = amd_bytealign (w[ 6], w[ 7], offset); + w[47] = amd_bytealign (w[ 5], w[ 6], offset); + w[46] = amd_bytealign (w[ 4], w[ 5], offset); + w[45] = amd_bytealign (w[ 3], w[ 4], offset); + w[44] = amd_bytealign (w[ 2], w[ 3], offset); + w[43] = amd_bytealign (w[ 1], w[ 2], offset); + w[42] = amd_bytealign (w[ 0], w[ 1], offset); + w[41] = amd_bytealign ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + 
w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign (w[20], w[21], offset); + w[62] = amd_bytealign (w[19], w[20], offset); + w[61] = amd_bytealign (w[18], w[19], offset); + w[60] = amd_bytealign (w[17], w[18], offset); + w[59] = amd_bytealign (w[16], w[17], offset); + w[58] = amd_bytealign (w[15], w[16], offset); + w[57] = amd_bytealign (w[14], w[15], offset); + w[56] = amd_bytealign (w[13], w[14], offset); + w[55] = amd_bytealign (w[12], w[13], offset); + w[54] = amd_bytealign (w[11], w[12], offset); + w[53] = amd_bytealign (w[10], w[11], offset); + w[52] = amd_bytealign (w[ 9], w[10], offset); + w[51] = amd_bytealign (w[ 8], w[ 9], offset); + w[50] = amd_bytealign (w[ 7], w[ 8], offset); + w[49] = amd_bytealign (w[ 6], w[ 7], offset); + w[48] = amd_bytealign (w[ 5], w[ 6], offset); + w[47] = amd_bytealign (w[ 4], w[ 5], offset); + w[46] = amd_bytealign (w[ 3], w[ 4], offset); + w[45] = amd_bytealign (w[ 2], w[ 3], offset); + w[44] = amd_bytealign (w[ 1], w[ 2], offset); + w[43] = amd_bytealign (w[ 0], w[ 1], offset); + w[42] = amd_bytealign ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign (w[19], w[20], 
offset); + w[62] = amd_bytealign (w[18], w[19], offset); + w[61] = amd_bytealign (w[17], w[18], offset); + w[60] = amd_bytealign (w[16], w[17], offset); + w[59] = amd_bytealign (w[15], w[16], offset); + w[58] = amd_bytealign (w[14], w[15], offset); + w[57] = amd_bytealign (w[13], w[14], offset); + w[56] = amd_bytealign (w[12], w[13], offset); + w[55] = amd_bytealign (w[11], w[12], offset); + w[54] = amd_bytealign (w[10], w[11], offset); + w[53] = amd_bytealign (w[ 9], w[10], offset); + w[52] = amd_bytealign (w[ 8], w[ 9], offset); + w[51] = amd_bytealign (w[ 7], w[ 8], offset); + w[50] = amd_bytealign (w[ 6], w[ 7], offset); + w[49] = amd_bytealign (w[ 5], w[ 6], offset); + w[48] = amd_bytealign (w[ 4], w[ 5], offset); + w[47] = amd_bytealign (w[ 3], w[ 4], offset); + w[46] = amd_bytealign (w[ 2], w[ 3], offset); + w[45] = amd_bytealign (w[ 1], w[ 2], offset); + w[44] = amd_bytealign (w[ 0], w[ 1], offset); + w[43] = amd_bytealign ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = amd_bytealign (w[18], w[19], offset); + w[62] = amd_bytealign (w[17], w[18], offset); + w[61] = amd_bytealign (w[16], w[17], offset); + w[60] = amd_bytealign (w[15], w[16], offset); + w[59] = amd_bytealign (w[14], w[15], offset); + w[58] = amd_bytealign (w[13], w[14], offset); + w[57] = amd_bytealign (w[12], w[13], offset); + w[56] = amd_bytealign (w[11], w[12], offset); + w[55] = amd_bytealign (w[10], w[11], offset); + w[54] = 
amd_bytealign (w[ 9], w[10], offset); + w[53] = amd_bytealign (w[ 8], w[ 9], offset); + w[52] = amd_bytealign (w[ 7], w[ 8], offset); + w[51] = amd_bytealign (w[ 6], w[ 7], offset); + w[50] = amd_bytealign (w[ 5], w[ 6], offset); + w[49] = amd_bytealign (w[ 4], w[ 5], offset); + w[48] = amd_bytealign (w[ 3], w[ 4], offset); + w[47] = amd_bytealign (w[ 2], w[ 3], offset); + w[46] = amd_bytealign (w[ 1], w[ 2], offset); + w[45] = amd_bytealign (w[ 0], w[ 1], offset); + w[44] = amd_bytealign ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign (w[17], w[18], offset); + w[62] = amd_bytealign (w[16], w[17], offset); + w[61] = amd_bytealign (w[15], w[16], offset); + w[60] = amd_bytealign (w[14], w[15], offset); + w[59] = amd_bytealign (w[13], w[14], offset); + w[58] = amd_bytealign (w[12], w[13], offset); + w[57] = amd_bytealign (w[11], w[12], offset); + w[56] = amd_bytealign (w[10], w[11], offset); + w[55] = amd_bytealign (w[ 9], w[10], offset); + w[54] = amd_bytealign (w[ 8], w[ 9], offset); + w[53] = amd_bytealign (w[ 7], w[ 8], offset); + w[52] = amd_bytealign (w[ 6], w[ 7], offset); + w[51] = amd_bytealign (w[ 5], w[ 6], offset); + w[50] = amd_bytealign (w[ 4], w[ 5], offset); + w[49] = amd_bytealign (w[ 3], w[ 4], offset); + w[48] = amd_bytealign (w[ 2], w[ 3], offset); + w[47] = amd_bytealign (w[ 1], w[ 2], offset); + w[46] = amd_bytealign (w[ 0], w[ 1], offset); + w[45] = amd_bytealign 
( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign (w[16], w[17], offset); + w[62] = amd_bytealign (w[15], w[16], offset); + w[61] = amd_bytealign (w[14], w[15], offset); + w[60] = amd_bytealign (w[13], w[14], offset); + w[59] = amd_bytealign (w[12], w[13], offset); + w[58] = amd_bytealign (w[11], w[12], offset); + w[57] = amd_bytealign (w[10], w[11], offset); + w[56] = amd_bytealign (w[ 9], w[10], offset); + w[55] = amd_bytealign (w[ 8], w[ 9], offset); + w[54] = amd_bytealign (w[ 7], w[ 8], offset); + w[53] = amd_bytealign (w[ 6], w[ 7], offset); + w[52] = amd_bytealign (w[ 5], w[ 6], offset); + w[51] = amd_bytealign (w[ 4], w[ 5], offset); + w[50] = amd_bytealign (w[ 3], w[ 4], offset); + w[49] = amd_bytealign (w[ 2], w[ 3], offset); + w[48] = amd_bytealign (w[ 1], w[ 2], offset); + w[47] = amd_bytealign (w[ 0], w[ 1], offset); + w[46] = amd_bytealign ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + 
w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign (w[15], w[16], offset); + w[62] = amd_bytealign (w[14], w[15], offset); + w[61] = amd_bytealign (w[13], w[14], offset); + w[60] = amd_bytealign (w[12], w[13], offset); + w[59] = amd_bytealign (w[11], w[12], offset); + w[58] = amd_bytealign (w[10], w[11], offset); + w[57] = amd_bytealign (w[ 9], w[10], offset); + w[56] = amd_bytealign (w[ 8], w[ 9], offset); + w[55] = amd_bytealign (w[ 7], w[ 8], offset); + w[54] = amd_bytealign (w[ 6], w[ 7], offset); + w[53] = amd_bytealign (w[ 5], w[ 6], offset); + w[52] = amd_bytealign (w[ 4], w[ 5], offset); + w[51] = amd_bytealign (w[ 3], w[ 4], offset); + w[50] = amd_bytealign (w[ 2], w[ 3], offset); + w[49] = amd_bytealign (w[ 1], w[ 2], offset); + w[48] = amd_bytealign (w[ 0], w[ 1], offset); + w[47] = amd_bytealign ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign (w[14], w[15], offset); + w[62] = amd_bytealign (w[13], w[14], offset); + w[61] = amd_bytealign (w[12], w[13], offset); + w[60] = amd_bytealign (w[11], w[12], offset); + w[59] = amd_bytealign (w[10], w[11], offset); + w[58] = amd_bytealign (w[ 9], w[10], offset); + w[57] = amd_bytealign (w[ 8], w[ 9], offset); + w[56] = amd_bytealign (w[ 7], w[ 8], offset); + w[55] = amd_bytealign (w[ 6], w[ 7], offset); + 
w[54] = amd_bytealign (w[ 5], w[ 6], offset); + w[53] = amd_bytealign (w[ 4], w[ 5], offset); + w[52] = amd_bytealign (w[ 3], w[ 4], offset); + w[51] = amd_bytealign (w[ 2], w[ 3], offset); + w[50] = amd_bytealign (w[ 1], w[ 2], offset); + w[49] = amd_bytealign (w[ 0], w[ 1], offset); + w[48] = amd_bytealign ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign (w[13], w[14], offset); + w[62] = amd_bytealign (w[12], w[13], offset); + w[61] = amd_bytealign (w[11], w[12], offset); + w[60] = amd_bytealign (w[10], w[11], offset); + w[59] = amd_bytealign (w[ 9], w[10], offset); + w[58] = amd_bytealign (w[ 8], w[ 9], offset); + w[57] = amd_bytealign (w[ 7], w[ 8], offset); + w[56] = amd_bytealign (w[ 6], w[ 7], offset); + w[55] = amd_bytealign (w[ 5], w[ 6], offset); + w[54] = amd_bytealign (w[ 4], w[ 5], offset); + w[53] = amd_bytealign (w[ 3], w[ 4], offset); + w[52] = amd_bytealign (w[ 2], w[ 3], offset); + w[51] = amd_bytealign (w[ 1], w[ 2], offset); + w[50] = amd_bytealign (w[ 0], w[ 1], offset); + w[49] = amd_bytealign ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + 
w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign (w[12], w[13], offset); + w[62] = amd_bytealign (w[11], w[12], offset); + w[61] = amd_bytealign (w[10], w[11], offset); + w[60] = amd_bytealign (w[ 9], w[10], offset); + w[59] = amd_bytealign (w[ 8], w[ 9], offset); + w[58] = amd_bytealign (w[ 7], w[ 8], offset); + w[57] = amd_bytealign (w[ 6], w[ 7], offset); + w[56] = amd_bytealign (w[ 5], w[ 6], offset); + w[55] = amd_bytealign (w[ 4], w[ 5], offset); + w[54] = amd_bytealign (w[ 3], w[ 4], offset); + w[53] = amd_bytealign (w[ 2], w[ 3], offset); + w[52] = amd_bytealign (w[ 1], w[ 2], offset); + w[51] = amd_bytealign (w[ 0], w[ 1], offset); + w[50] = amd_bytealign ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = amd_bytealign (w[11], w[12], offset); + w[62] = amd_bytealign (w[10], w[11], offset); + w[61] = amd_bytealign (w[ 9], w[10], offset); + w[60] = amd_bytealign (w[ 8], w[ 9], offset); + w[59] = amd_bytealign (w[ 7], w[ 8], offset); + w[58] = amd_bytealign (w[ 6], w[ 7], offset); + w[57] 
= amd_bytealign (w[ 5], w[ 6], offset); + w[56] = amd_bytealign (w[ 4], w[ 5], offset); + w[55] = amd_bytealign (w[ 3], w[ 4], offset); + w[54] = amd_bytealign (w[ 2], w[ 3], offset); + w[53] = amd_bytealign (w[ 1], w[ 2], offset); + w[52] = amd_bytealign (w[ 0], w[ 1], offset); + w[51] = amd_bytealign ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign (w[10], w[11], offset); + w[62] = amd_bytealign (w[ 9], w[10], offset); + w[61] = amd_bytealign (w[ 8], w[ 9], offset); + w[60] = amd_bytealign (w[ 7], w[ 8], offset); + w[59] = amd_bytealign (w[ 6], w[ 7], offset); + w[58] = amd_bytealign (w[ 5], w[ 6], offset); + w[57] = amd_bytealign (w[ 4], w[ 5], offset); + w[56] = amd_bytealign (w[ 3], w[ 4], offset); + w[55] = amd_bytealign (w[ 2], w[ 3], offset); + w[54] = amd_bytealign (w[ 1], w[ 2], offset); + w[53] = amd_bytealign (w[ 0], w[ 1], offset); + w[52] = amd_bytealign ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 
0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign (w[ 9], w[10], offset); + w[62] = amd_bytealign (w[ 8], w[ 9], offset); + w[61] = amd_bytealign (w[ 7], w[ 8], offset); + w[60] = amd_bytealign (w[ 6], w[ 7], offset); + w[59] = amd_bytealign (w[ 5], w[ 6], offset); + w[58] = amd_bytealign (w[ 4], w[ 5], offset); + w[57] = amd_bytealign (w[ 3], w[ 4], offset); + w[56] = amd_bytealign (w[ 2], w[ 3], offset); + w[55] = amd_bytealign (w[ 1], w[ 2], offset); + w[54] = amd_bytealign (w[ 0], w[ 1], offset); + w[53] = amd_bytealign ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign (w[ 8], w[ 9], offset); + w[62] = amd_bytealign (w[ 7], w[ 8], offset); + w[61] = amd_bytealign (w[ 6], w[ 7], offset); + w[60] = amd_bytealign (w[ 5], w[ 6], offset); + w[59] = amd_bytealign (w[ 4], w[ 5], offset); + w[58] = amd_bytealign (w[ 3], w[ 4], offset); + w[57] = amd_bytealign (w[ 2], w[ 3], offset); + w[56] = amd_bytealign (w[ 1], w[ 2], offset); + w[55] = amd_bytealign (w[ 0], w[ 1], offset); + w[54] = amd_bytealign ( 0, w[ 0], 
offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign (w[ 7], w[ 8], offset); + w[62] = amd_bytealign (w[ 6], w[ 7], offset); + w[61] = amd_bytealign (w[ 5], w[ 6], offset); + w[60] = amd_bytealign (w[ 4], w[ 5], offset); + w[59] = amd_bytealign (w[ 3], w[ 4], offset); + w[58] = amd_bytealign (w[ 2], w[ 3], offset); + w[57] = amd_bytealign (w[ 1], w[ 2], offset); + w[56] = amd_bytealign (w[ 0], w[ 1], offset); + w[55] = amd_bytealign ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign (w[ 6], w[ 7], offset); + w[62] = amd_bytealign (w[ 5], w[ 6], offset); 
+ w[61] = amd_bytealign (w[ 4], w[ 5], offset); + w[60] = amd_bytealign (w[ 3], w[ 4], offset); + w[59] = amd_bytealign (w[ 2], w[ 3], offset); + w[58] = amd_bytealign (w[ 1], w[ 2], offset); + w[57] = amd_bytealign (w[ 0], w[ 1], offset); + w[56] = amd_bytealign ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign (w[ 5], w[ 6], offset); + w[62] = amd_bytealign (w[ 4], w[ 5], offset); + w[61] = amd_bytealign (w[ 3], w[ 4], offset); + w[60] = amd_bytealign (w[ 2], w[ 3], offset); + w[59] = amd_bytealign (w[ 1], w[ 2], offset); + w[58] = amd_bytealign (w[ 0], w[ 1], offset); + w[57] = amd_bytealign ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 
8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign (w[ 4], w[ 5], offset); + w[62] = amd_bytealign (w[ 3], w[ 4], offset); + w[61] = amd_bytealign (w[ 2], w[ 3], offset); + w[60] = amd_bytealign (w[ 1], w[ 2], offset); + w[59] = amd_bytealign (w[ 0], w[ 1], offset); + w[58] = amd_bytealign ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = amd_bytealign (w[ 3], w[ 4], offset); + w[62] = amd_bytealign (w[ 2], w[ 3], offset); + w[61] = amd_bytealign (w[ 1], w[ 2], offset); + w[60] = amd_bytealign (w[ 0], w[ 1], offset); + w[59] = amd_bytealign ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + 
w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = amd_bytealign (w[ 2], w[ 3], offset); + w[62] = amd_bytealign (w[ 1], w[ 2], offset); + w[61] = amd_bytealign (w[ 0], w[ 1], offset); + w[60] = amd_bytealign ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign (w[ 1], w[ 2], offset); + w[62] = amd_bytealign (w[ 0], w[ 1], offset); + w[61] = amd_bytealign ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 
0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign (w[ 0], w[ 1], offset); + w[62] = amd_bytealign ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + + #pragma 
unroll + for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); + + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm (w[62], w[63], selector); + w[62] = __byte_perm (w[61], w[62], selector); + w[61] = __byte_perm (w[60], w[61], selector); + w[60] = __byte_perm (w[59], w[60], selector); + w[59] = __byte_perm (w[58], w[59], selector); + w[58] = __byte_perm (w[57], w[58], selector); + w[57] = __byte_perm (w[56], w[57], selector); + w[56] = __byte_perm (w[55], w[56], selector); + w[55] = __byte_perm (w[54], w[55], selector); + w[54] = __byte_perm (w[53], w[54], selector); + w[53] = __byte_perm (w[52], w[53], selector); + w[52] = __byte_perm (w[51], w[52], selector); + w[51] = __byte_perm (w[50], w[51], selector); + w[50] = __byte_perm (w[49], w[50], selector); + w[49] = __byte_perm (w[48], w[49], selector); + w[48] = __byte_perm (w[47], w[48], selector); + w[47] = __byte_perm (w[46], w[47], selector); + w[46] = __byte_perm (w[45], w[46], selector); + w[45] = __byte_perm (w[44], w[45], selector); + w[44] = __byte_perm (w[43], w[44], selector); + w[43] = __byte_perm (w[42], w[43], selector); + w[42] = __byte_perm (w[41], w[42], selector); + w[41] = __byte_perm (w[40], w[41], selector); + w[40] = __byte_perm (w[39], w[40], selector); + w[39] = __byte_perm (w[38], w[39], selector); + w[38] = __byte_perm (w[37], w[38], selector); + w[37] = __byte_perm (w[36], w[37], selector); + w[36] = __byte_perm (w[35], w[36], selector); + w[35] = __byte_perm (w[34], w[35], selector); + w[34] = __byte_perm (w[33], w[34], selector); + w[33] = __byte_perm (w[32], w[33], selector); + w[32] = __byte_perm (w[31], w[32], selector); + w[31] = __byte_perm (w[30], w[31], selector); + w[30] = __byte_perm (w[29], w[30], selector); + w[29] = __byte_perm (w[28], w[29], selector); + w[28] = __byte_perm (w[27], w[28], selector); + w[27] = __byte_perm (w[26], w[27], selector); + w[26] = __byte_perm 
(w[25], w[26], selector); + w[25] = __byte_perm (w[24], w[25], selector); + w[24] = __byte_perm (w[23], w[24], selector); + w[23] = __byte_perm (w[22], w[23], selector); + w[22] = __byte_perm (w[21], w[22], selector); + w[21] = __byte_perm (w[20], w[21], selector); + w[20] = __byte_perm (w[19], w[20], selector); + w[19] = __byte_perm (w[18], w[19], selector); + w[18] = __byte_perm (w[17], w[18], selector); + w[17] = __byte_perm (w[16], w[17], selector); + w[16] = __byte_perm (w[15], w[16], selector); + w[15] = __byte_perm (w[14], w[15], selector); + w[14] = __byte_perm (w[13], w[14], selector); + w[13] = __byte_perm (w[12], w[13], selector); + w[12] = __byte_perm (w[11], w[12], selector); + w[11] = __byte_perm (w[10], w[11], selector); + w[10] = __byte_perm (w[ 9], w[10], selector); + w[ 9] = __byte_perm (w[ 8], w[ 9], selector); + w[ 8] = __byte_perm (w[ 7], w[ 8], selector); + w[ 7] = __byte_perm (w[ 6], w[ 7], selector); + w[ 6] = __byte_perm (w[ 5], w[ 6], selector); + w[ 5] = __byte_perm (w[ 4], w[ 5], selector); + w[ 4] = __byte_perm (w[ 3], w[ 4], selector); + w[ 3] = __byte_perm (w[ 2], w[ 3], selector); + w[ 2] = __byte_perm (w[ 1], w[ 2], selector); + w[ 1] = __byte_perm (w[ 0], w[ 1], selector); + w[ 0] = __byte_perm ( 0, w[ 0], selector); + + break; + + case 1: + w[63] = __byte_perm (w[61], w[62], selector); + w[62] = __byte_perm (w[60], w[61], selector); + w[61] = __byte_perm (w[59], w[60], selector); + w[60] = __byte_perm (w[58], w[59], selector); + w[59] = __byte_perm (w[57], w[58], selector); + w[58] = __byte_perm (w[56], w[57], selector); + w[57] = __byte_perm (w[55], w[56], selector); + w[56] = __byte_perm (w[54], w[55], selector); + w[55] = __byte_perm (w[53], w[54], selector); + w[54] = __byte_perm (w[52], w[53], selector); + w[53] = __byte_perm (w[51], w[52], selector); + w[52] = __byte_perm (w[50], w[51], selector); + w[51] = __byte_perm (w[49], w[50], selector); + w[50] = __byte_perm (w[48], w[49], selector); + w[49] = __byte_perm (w[47], 
w[48], selector); + w[48] = __byte_perm (w[46], w[47], selector); + w[47] = __byte_perm (w[45], w[46], selector); + w[46] = __byte_perm (w[44], w[45], selector); + w[45] = __byte_perm (w[43], w[44], selector); + w[44] = __byte_perm (w[42], w[43], selector); + w[43] = __byte_perm (w[41], w[42], selector); + w[42] = __byte_perm (w[40], w[41], selector); + w[41] = __byte_perm (w[39], w[40], selector); + w[40] = __byte_perm (w[38], w[39], selector); + w[39] = __byte_perm (w[37], w[38], selector); + w[38] = __byte_perm (w[36], w[37], selector); + w[37] = __byte_perm (w[35], w[36], selector); + w[36] = __byte_perm (w[34], w[35], selector); + w[35] = __byte_perm (w[33], w[34], selector); + w[34] = __byte_perm (w[32], w[33], selector); + w[33] = __byte_perm (w[31], w[32], selector); + w[32] = __byte_perm (w[30], w[31], selector); + w[31] = __byte_perm (w[29], w[30], selector); + w[30] = __byte_perm (w[28], w[29], selector); + w[29] = __byte_perm (w[27], w[28], selector); + w[28] = __byte_perm (w[26], w[27], selector); + w[27] = __byte_perm (w[25], w[26], selector); + w[26] = __byte_perm (w[24], w[25], selector); + w[25] = __byte_perm (w[23], w[24], selector); + w[24] = __byte_perm (w[22], w[23], selector); + w[23] = __byte_perm (w[21], w[22], selector); + w[22] = __byte_perm (w[20], w[21], selector); + w[21] = __byte_perm (w[19], w[20], selector); + w[20] = __byte_perm (w[18], w[19], selector); + w[19] = __byte_perm (w[17], w[18], selector); + w[18] = __byte_perm (w[16], w[17], selector); + w[17] = __byte_perm (w[15], w[16], selector); + w[16] = __byte_perm (w[14], w[15], selector); + w[15] = __byte_perm (w[13], w[14], selector); + w[14] = __byte_perm (w[12], w[13], selector); + w[13] = __byte_perm (w[11], w[12], selector); + w[12] = __byte_perm (w[10], w[11], selector); + w[11] = __byte_perm (w[ 9], w[10], selector); + w[10] = __byte_perm (w[ 8], w[ 9], selector); + w[ 9] = __byte_perm (w[ 7], w[ 8], selector); + w[ 8] = __byte_perm (w[ 6], w[ 7], selector); + w[ 7] = 
__byte_perm (w[ 5], w[ 6], selector); + w[ 6] = __byte_perm (w[ 4], w[ 5], selector); + w[ 5] = __byte_perm (w[ 3], w[ 4], selector); + w[ 4] = __byte_perm (w[ 2], w[ 3], selector); + w[ 3] = __byte_perm (w[ 1], w[ 2], selector); + w[ 2] = __byte_perm (w[ 0], w[ 1], selector); + w[ 1] = __byte_perm ( 0, w[ 0], selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm (w[60], w[61], selector); + w[62] = __byte_perm (w[59], w[60], selector); + w[61] = __byte_perm (w[58], w[59], selector); + w[60] = __byte_perm (w[57], w[58], selector); + w[59] = __byte_perm (w[56], w[57], selector); + w[58] = __byte_perm (w[55], w[56], selector); + w[57] = __byte_perm (w[54], w[55], selector); + w[56] = __byte_perm (w[53], w[54], selector); + w[55] = __byte_perm (w[52], w[53], selector); + w[54] = __byte_perm (w[51], w[52], selector); + w[53] = __byte_perm (w[50], w[51], selector); + w[52] = __byte_perm (w[49], w[50], selector); + w[51] = __byte_perm (w[48], w[49], selector); + w[50] = __byte_perm (w[47], w[48], selector); + w[49] = __byte_perm (w[46], w[47], selector); + w[48] = __byte_perm (w[45], w[46], selector); + w[47] = __byte_perm (w[44], w[45], selector); + w[46] = __byte_perm (w[43], w[44], selector); + w[45] = __byte_perm (w[42], w[43], selector); + w[44] = __byte_perm (w[41], w[42], selector); + w[43] = __byte_perm (w[40], w[41], selector); + w[42] = __byte_perm (w[39], w[40], selector); + w[41] = __byte_perm (w[38], w[39], selector); + w[40] = __byte_perm (w[37], w[38], selector); + w[39] = __byte_perm (w[36], w[37], selector); + w[38] = __byte_perm (w[35], w[36], selector); + w[37] = __byte_perm (w[34], w[35], selector); + w[36] = __byte_perm (w[33], w[34], selector); + w[35] = __byte_perm (w[32], w[33], selector); + w[34] = __byte_perm (w[31], w[32], selector); + w[33] = __byte_perm (w[30], w[31], selector); + w[32] = __byte_perm (w[29], w[30], selector); + w[31] = __byte_perm (w[28], w[29], selector); + w[30] = __byte_perm (w[27], w[28], selector); + w[29] 
= __byte_perm (w[26], w[27], selector); + w[28] = __byte_perm (w[25], w[26], selector); + w[27] = __byte_perm (w[24], w[25], selector); + w[26] = __byte_perm (w[23], w[24], selector); + w[25] = __byte_perm (w[22], w[23], selector); + w[24] = __byte_perm (w[21], w[22], selector); + w[23] = __byte_perm (w[20], w[21], selector); + w[22] = __byte_perm (w[19], w[20], selector); + w[21] = __byte_perm (w[18], w[19], selector); + w[20] = __byte_perm (w[17], w[18], selector); + w[19] = __byte_perm (w[16], w[17], selector); + w[18] = __byte_perm (w[15], w[16], selector); + w[17] = __byte_perm (w[14], w[15], selector); + w[16] = __byte_perm (w[13], w[14], selector); + w[15] = __byte_perm (w[12], w[13], selector); + w[14] = __byte_perm (w[11], w[12], selector); + w[13] = __byte_perm (w[10], w[11], selector); + w[12] = __byte_perm (w[ 9], w[10], selector); + w[11] = __byte_perm (w[ 8], w[ 9], selector); + w[10] = __byte_perm (w[ 7], w[ 8], selector); + w[ 9] = __byte_perm (w[ 6], w[ 7], selector); + w[ 8] = __byte_perm (w[ 5], w[ 6], selector); + w[ 7] = __byte_perm (w[ 4], w[ 5], selector); + w[ 6] = __byte_perm (w[ 3], w[ 4], selector); + w[ 5] = __byte_perm (w[ 2], w[ 3], selector); + w[ 4] = __byte_perm (w[ 1], w[ 2], selector); + w[ 3] = __byte_perm (w[ 0], w[ 1], selector); + w[ 2] = __byte_perm ( 0, w[ 0], selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm (w[59], w[60], selector); + w[62] = __byte_perm (w[58], w[59], selector); + w[61] = __byte_perm (w[57], w[58], selector); + w[60] = __byte_perm (w[56], w[57], selector); + w[59] = __byte_perm (w[55], w[56], selector); + w[58] = __byte_perm (w[54], w[55], selector); + w[57] = __byte_perm (w[53], w[54], selector); + w[56] = __byte_perm (w[52], w[53], selector); + w[55] = __byte_perm (w[51], w[52], selector); + w[54] = __byte_perm (w[50], w[51], selector); + w[53] = __byte_perm (w[49], w[50], selector); + w[52] = __byte_perm (w[48], w[49], selector); + w[51] = __byte_perm (w[47], w[48], 
selector); + w[50] = __byte_perm (w[46], w[47], selector); + w[49] = __byte_perm (w[45], w[46], selector); + w[48] = __byte_perm (w[44], w[45], selector); + w[47] = __byte_perm (w[43], w[44], selector); + w[46] = __byte_perm (w[42], w[43], selector); + w[45] = __byte_perm (w[41], w[42], selector); + w[44] = __byte_perm (w[40], w[41], selector); + w[43] = __byte_perm (w[39], w[40], selector); + w[42] = __byte_perm (w[38], w[39], selector); + w[41] = __byte_perm (w[37], w[38], selector); + w[40] = __byte_perm (w[36], w[37], selector); + w[39] = __byte_perm (w[35], w[36], selector); + w[38] = __byte_perm (w[34], w[35], selector); + w[37] = __byte_perm (w[33], w[34], selector); + w[36] = __byte_perm (w[32], w[33], selector); + w[35] = __byte_perm (w[31], w[32], selector); + w[34] = __byte_perm (w[30], w[31], selector); + w[33] = __byte_perm (w[29], w[30], selector); + w[32] = __byte_perm (w[28], w[29], selector); + w[31] = __byte_perm (w[27], w[28], selector); + w[30] = __byte_perm (w[26], w[27], selector); + w[29] = __byte_perm (w[25], w[26], selector); + w[28] = __byte_perm (w[24], w[25], selector); + w[27] = __byte_perm (w[23], w[24], selector); + w[26] = __byte_perm (w[22], w[23], selector); + w[25] = __byte_perm (w[21], w[22], selector); + w[24] = __byte_perm (w[20], w[21], selector); + w[23] = __byte_perm (w[19], w[20], selector); + w[22] = __byte_perm (w[18], w[19], selector); + w[21] = __byte_perm (w[17], w[18], selector); + w[20] = __byte_perm (w[16], w[17], selector); + w[19] = __byte_perm (w[15], w[16], selector); + w[18] = __byte_perm (w[14], w[15], selector); + w[17] = __byte_perm (w[13], w[14], selector); + w[16] = __byte_perm (w[12], w[13], selector); + w[15] = __byte_perm (w[11], w[12], selector); + w[14] = __byte_perm (w[10], w[11], selector); + w[13] = __byte_perm (w[ 9], w[10], selector); + w[12] = __byte_perm (w[ 8], w[ 9], selector); + w[11] = __byte_perm (w[ 7], w[ 8], selector); + w[10] = __byte_perm (w[ 6], w[ 7], selector); + w[ 9] = 
__byte_perm (w[ 5], w[ 6], selector); + w[ 8] = __byte_perm (w[ 4], w[ 5], selector); + w[ 7] = __byte_perm (w[ 3], w[ 4], selector); + w[ 6] = __byte_perm (w[ 2], w[ 3], selector); + w[ 5] = __byte_perm (w[ 1], w[ 2], selector); + w[ 4] = __byte_perm (w[ 0], w[ 1], selector); + w[ 3] = __byte_perm ( 0, w[ 0], selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm (w[58], w[59], selector); + w[62] = __byte_perm (w[57], w[58], selector); + w[61] = __byte_perm (w[56], w[57], selector); + w[60] = __byte_perm (w[55], w[56], selector); + w[59] = __byte_perm (w[54], w[55], selector); + w[58] = __byte_perm (w[53], w[54], selector); + w[57] = __byte_perm (w[52], w[53], selector); + w[56] = __byte_perm (w[51], w[52], selector); + w[55] = __byte_perm (w[50], w[51], selector); + w[54] = __byte_perm (w[49], w[50], selector); + w[53] = __byte_perm (w[48], w[49], selector); + w[52] = __byte_perm (w[47], w[48], selector); + w[51] = __byte_perm (w[46], w[47], selector); + w[50] = __byte_perm (w[45], w[46], selector); + w[49] = __byte_perm (w[44], w[45], selector); + w[48] = __byte_perm (w[43], w[44], selector); + w[47] = __byte_perm (w[42], w[43], selector); + w[46] = __byte_perm (w[41], w[42], selector); + w[45] = __byte_perm (w[40], w[41], selector); + w[44] = __byte_perm (w[39], w[40], selector); + w[43] = __byte_perm (w[38], w[39], selector); + w[42] = __byte_perm (w[37], w[38], selector); + w[41] = __byte_perm (w[36], w[37], selector); + w[40] = __byte_perm (w[35], w[36], selector); + w[39] = __byte_perm (w[34], w[35], selector); + w[38] = __byte_perm (w[33], w[34], selector); + w[37] = __byte_perm (w[32], w[33], selector); + w[36] = __byte_perm (w[31], w[32], selector); + w[35] = __byte_perm (w[30], w[31], selector); + w[34] = __byte_perm (w[29], w[30], selector); + w[33] = __byte_perm (w[28], w[29], selector); + w[32] = __byte_perm (w[27], w[28], selector); + w[31] = __byte_perm (w[26], w[27], selector); + w[30] = __byte_perm (w[25], 
w[26], selector); + w[29] = __byte_perm (w[24], w[25], selector); + w[28] = __byte_perm (w[23], w[24], selector); + w[27] = __byte_perm (w[22], w[23], selector); + w[26] = __byte_perm (w[21], w[22], selector); + w[25] = __byte_perm (w[20], w[21], selector); + w[24] = __byte_perm (w[19], w[20], selector); + w[23] = __byte_perm (w[18], w[19], selector); + w[22] = __byte_perm (w[17], w[18], selector); + w[21] = __byte_perm (w[16], w[17], selector); + w[20] = __byte_perm (w[15], w[16], selector); + w[19] = __byte_perm (w[14], w[15], selector); + w[18] = __byte_perm (w[13], w[14], selector); + w[17] = __byte_perm (w[12], w[13], selector); + w[16] = __byte_perm (w[11], w[12], selector); + w[15] = __byte_perm (w[10], w[11], selector); + w[14] = __byte_perm (w[ 9], w[10], selector); + w[13] = __byte_perm (w[ 8], w[ 9], selector); + w[12] = __byte_perm (w[ 7], w[ 8], selector); + w[11] = __byte_perm (w[ 6], w[ 7], selector); + w[10] = __byte_perm (w[ 5], w[ 6], selector); + w[ 9] = __byte_perm (w[ 4], w[ 5], selector); + w[ 8] = __byte_perm (w[ 3], w[ 4], selector); + w[ 7] = __byte_perm (w[ 2], w[ 3], selector); + w[ 6] = __byte_perm (w[ 1], w[ 2], selector); + w[ 5] = __byte_perm (w[ 0], w[ 1], selector); + w[ 4] = __byte_perm ( 0, w[ 0], selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm (w[57], w[58], selector); + w[62] = __byte_perm (w[56], w[57], selector); + w[61] = __byte_perm (w[55], w[56], selector); + w[60] = __byte_perm (w[54], w[55], selector); + w[59] = __byte_perm (w[53], w[54], selector); + w[58] = __byte_perm (w[52], w[53], selector); + w[57] = __byte_perm (w[51], w[52], selector); + w[56] = __byte_perm (w[50], w[51], selector); + w[55] = __byte_perm (w[49], w[50], selector); + w[54] = __byte_perm (w[48], w[49], selector); + w[53] = __byte_perm (w[47], w[48], selector); + w[52] = __byte_perm (w[46], w[47], selector); + w[51] = __byte_perm (w[45], w[46], selector); + w[50] = __byte_perm (w[44], w[45], 
selector); + w[49] = __byte_perm (w[43], w[44], selector); + w[48] = __byte_perm (w[42], w[43], selector); + w[47] = __byte_perm (w[41], w[42], selector); + w[46] = __byte_perm (w[40], w[41], selector); + w[45] = __byte_perm (w[39], w[40], selector); + w[44] = __byte_perm (w[38], w[39], selector); + w[43] = __byte_perm (w[37], w[38], selector); + w[42] = __byte_perm (w[36], w[37], selector); + w[41] = __byte_perm (w[35], w[36], selector); + w[40] = __byte_perm (w[34], w[35], selector); + w[39] = __byte_perm (w[33], w[34], selector); + w[38] = __byte_perm (w[32], w[33], selector); + w[37] = __byte_perm (w[31], w[32], selector); + w[36] = __byte_perm (w[30], w[31], selector); + w[35] = __byte_perm (w[29], w[30], selector); + w[34] = __byte_perm (w[28], w[29], selector); + w[33] = __byte_perm (w[27], w[28], selector); + w[32] = __byte_perm (w[26], w[27], selector); + w[31] = __byte_perm (w[25], w[26], selector); + w[30] = __byte_perm (w[24], w[25], selector); + w[29] = __byte_perm (w[23], w[24], selector); + w[28] = __byte_perm (w[22], w[23], selector); + w[27] = __byte_perm (w[21], w[22], selector); + w[26] = __byte_perm (w[20], w[21], selector); + w[25] = __byte_perm (w[19], w[20], selector); + w[24] = __byte_perm (w[18], w[19], selector); + w[23] = __byte_perm (w[17], w[18], selector); + w[22] = __byte_perm (w[16], w[17], selector); + w[21] = __byte_perm (w[15], w[16], selector); + w[20] = __byte_perm (w[14], w[15], selector); + w[19] = __byte_perm (w[13], w[14], selector); + w[18] = __byte_perm (w[12], w[13], selector); + w[17] = __byte_perm (w[11], w[12], selector); + w[16] = __byte_perm (w[10], w[11], selector); + w[15] = __byte_perm (w[ 9], w[10], selector); + w[14] = __byte_perm (w[ 8], w[ 9], selector); + w[13] = __byte_perm (w[ 7], w[ 8], selector); + w[12] = __byte_perm (w[ 6], w[ 7], selector); + w[11] = __byte_perm (w[ 5], w[ 6], selector); + w[10] = __byte_perm (w[ 4], w[ 5], selector); + w[ 9] = __byte_perm (w[ 3], w[ 4], selector); + w[ 8] = 
__byte_perm (w[ 2], w[ 3], selector); + w[ 7] = __byte_perm (w[ 1], w[ 2], selector); + w[ 6] = __byte_perm (w[ 0], w[ 1], selector); + w[ 5] = __byte_perm ( 0, w[ 0], selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm (w[56], w[57], selector); + w[62] = __byte_perm (w[55], w[56], selector); + w[61] = __byte_perm (w[54], w[55], selector); + w[60] = __byte_perm (w[53], w[54], selector); + w[59] = __byte_perm (w[52], w[53], selector); + w[58] = __byte_perm (w[51], w[52], selector); + w[57] = __byte_perm (w[50], w[51], selector); + w[56] = __byte_perm (w[49], w[50], selector); + w[55] = __byte_perm (w[48], w[49], selector); + w[54] = __byte_perm (w[47], w[48], selector); + w[53] = __byte_perm (w[46], w[47], selector); + w[52] = __byte_perm (w[45], w[46], selector); + w[51] = __byte_perm (w[44], w[45], selector); + w[50] = __byte_perm (w[43], w[44], selector); + w[49] = __byte_perm (w[42], w[43], selector); + w[48] = __byte_perm (w[41], w[42], selector); + w[47] = __byte_perm (w[40], w[41], selector); + w[46] = __byte_perm (w[39], w[40], selector); + w[45] = __byte_perm (w[38], w[39], selector); + w[44] = __byte_perm (w[37], w[38], selector); + w[43] = __byte_perm (w[36], w[37], selector); + w[42] = __byte_perm (w[35], w[36], selector); + w[41] = __byte_perm (w[34], w[35], selector); + w[40] = __byte_perm (w[33], w[34], selector); + w[39] = __byte_perm (w[32], w[33], selector); + w[38] = __byte_perm (w[31], w[32], selector); + w[37] = __byte_perm (w[30], w[31], selector); + w[36] = __byte_perm (w[29], w[30], selector); + w[35] = __byte_perm (w[28], w[29], selector); + w[34] = __byte_perm (w[27], w[28], selector); + w[33] = __byte_perm (w[26], w[27], selector); + w[32] = __byte_perm (w[25], w[26], selector); + w[31] = __byte_perm (w[24], w[25], selector); + w[30] = __byte_perm (w[23], w[24], selector); + w[29] = __byte_perm (w[22], w[23], selector); + w[28] = __byte_perm (w[21], w[22], selector); + 
w[27] = __byte_perm (w[20], w[21], selector); + w[26] = __byte_perm (w[19], w[20], selector); + w[25] = __byte_perm (w[18], w[19], selector); + w[24] = __byte_perm (w[17], w[18], selector); + w[23] = __byte_perm (w[16], w[17], selector); + w[22] = __byte_perm (w[15], w[16], selector); + w[21] = __byte_perm (w[14], w[15], selector); + w[20] = __byte_perm (w[13], w[14], selector); + w[19] = __byte_perm (w[12], w[13], selector); + w[18] = __byte_perm (w[11], w[12], selector); + w[17] = __byte_perm (w[10], w[11], selector); + w[16] = __byte_perm (w[ 9], w[10], selector); + w[15] = __byte_perm (w[ 8], w[ 9], selector); + w[14] = __byte_perm (w[ 7], w[ 8], selector); + w[13] = __byte_perm (w[ 6], w[ 7], selector); + w[12] = __byte_perm (w[ 5], w[ 6], selector); + w[11] = __byte_perm (w[ 4], w[ 5], selector); + w[10] = __byte_perm (w[ 3], w[ 4], selector); + w[ 9] = __byte_perm (w[ 2], w[ 3], selector); + w[ 8] = __byte_perm (w[ 1], w[ 2], selector); + w[ 7] = __byte_perm (w[ 0], w[ 1], selector); + w[ 6] = __byte_perm ( 0, w[ 0], selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm (w[55], w[56], selector); + w[62] = __byte_perm (w[54], w[55], selector); + w[61] = __byte_perm (w[53], w[54], selector); + w[60] = __byte_perm (w[52], w[53], selector); + w[59] = __byte_perm (w[51], w[52], selector); + w[58] = __byte_perm (w[50], w[51], selector); + w[57] = __byte_perm (w[49], w[50], selector); + w[56] = __byte_perm (w[48], w[49], selector); + w[55] = __byte_perm (w[47], w[48], selector); + w[54] = __byte_perm (w[46], w[47], selector); + w[53] = __byte_perm (w[45], w[46], selector); + w[52] = __byte_perm (w[44], w[45], selector); + w[51] = __byte_perm (w[43], w[44], selector); + w[50] = __byte_perm (w[42], w[43], selector); + w[49] = __byte_perm (w[41], w[42], selector); + w[48] = __byte_perm (w[40], w[41], selector); + w[47] = __byte_perm (w[39], w[40], selector); + w[46] = __byte_perm (w[38], 
w[39], selector); + w[45] = __byte_perm (w[37], w[38], selector); + w[44] = __byte_perm (w[36], w[37], selector); + w[43] = __byte_perm (w[35], w[36], selector); + w[42] = __byte_perm (w[34], w[35], selector); + w[41] = __byte_perm (w[33], w[34], selector); + w[40] = __byte_perm (w[32], w[33], selector); + w[39] = __byte_perm (w[31], w[32], selector); + w[38] = __byte_perm (w[30], w[31], selector); + w[37] = __byte_perm (w[29], w[30], selector); + w[36] = __byte_perm (w[28], w[29], selector); + w[35] = __byte_perm (w[27], w[28], selector); + w[34] = __byte_perm (w[26], w[27], selector); + w[33] = __byte_perm (w[25], w[26], selector); + w[32] = __byte_perm (w[24], w[25], selector); + w[31] = __byte_perm (w[23], w[24], selector); + w[30] = __byte_perm (w[22], w[23], selector); + w[29] = __byte_perm (w[21], w[22], selector); + w[28] = __byte_perm (w[20], w[21], selector); + w[27] = __byte_perm (w[19], w[20], selector); + w[26] = __byte_perm (w[18], w[19], selector); + w[25] = __byte_perm (w[17], w[18], selector); + w[24] = __byte_perm (w[16], w[17], selector); + w[23] = __byte_perm (w[15], w[16], selector); + w[22] = __byte_perm (w[14], w[15], selector); + w[21] = __byte_perm (w[13], w[14], selector); + w[20] = __byte_perm (w[12], w[13], selector); + w[19] = __byte_perm (w[11], w[12], selector); + w[18] = __byte_perm (w[10], w[11], selector); + w[17] = __byte_perm (w[ 9], w[10], selector); + w[16] = __byte_perm (w[ 8], w[ 9], selector); + w[15] = __byte_perm (w[ 7], w[ 8], selector); + w[14] = __byte_perm (w[ 6], w[ 7], selector); + w[13] = __byte_perm (w[ 5], w[ 6], selector); + w[12] = __byte_perm (w[ 4], w[ 5], selector); + w[11] = __byte_perm (w[ 3], w[ 4], selector); + w[10] = __byte_perm (w[ 2], w[ 3], selector); + w[ 9] = __byte_perm (w[ 1], w[ 2], selector); + w[ 8] = __byte_perm (w[ 0], w[ 1], selector); + w[ 7] = __byte_perm ( 0, w[ 0], selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 
8: + w[63] = __byte_perm (w[54], w[55], selector); + w[62] = __byte_perm (w[53], w[54], selector); + w[61] = __byte_perm (w[52], w[53], selector); + w[60] = __byte_perm (w[51], w[52], selector); + w[59] = __byte_perm (w[50], w[51], selector); + w[58] = __byte_perm (w[49], w[50], selector); + w[57] = __byte_perm (w[48], w[49], selector); + w[56] = __byte_perm (w[47], w[48], selector); + w[55] = __byte_perm (w[46], w[47], selector); + w[54] = __byte_perm (w[45], w[46], selector); + w[53] = __byte_perm (w[44], w[45], selector); + w[52] = __byte_perm (w[43], w[44], selector); + w[51] = __byte_perm (w[42], w[43], selector); + w[50] = __byte_perm (w[41], w[42], selector); + w[49] = __byte_perm (w[40], w[41], selector); + w[48] = __byte_perm (w[39], w[40], selector); + w[47] = __byte_perm (w[38], w[39], selector); + w[46] = __byte_perm (w[37], w[38], selector); + w[45] = __byte_perm (w[36], w[37], selector); + w[44] = __byte_perm (w[35], w[36], selector); + w[43] = __byte_perm (w[34], w[35], selector); + w[42] = __byte_perm (w[33], w[34], selector); + w[41] = __byte_perm (w[32], w[33], selector); + w[40] = __byte_perm (w[31], w[32], selector); + w[39] = __byte_perm (w[30], w[31], selector); + w[38] = __byte_perm (w[29], w[30], selector); + w[37] = __byte_perm (w[28], w[29], selector); + w[36] = __byte_perm (w[27], w[28], selector); + w[35] = __byte_perm (w[26], w[27], selector); + w[34] = __byte_perm (w[25], w[26], selector); + w[33] = __byte_perm (w[24], w[25], selector); + w[32] = __byte_perm (w[23], w[24], selector); + w[31] = __byte_perm (w[22], w[23], selector); + w[30] = __byte_perm (w[21], w[22], selector); + w[29] = __byte_perm (w[20], w[21], selector); + w[28] = __byte_perm (w[19], w[20], selector); + w[27] = __byte_perm (w[18], w[19], selector); + w[26] = __byte_perm (w[17], w[18], selector); + w[25] = __byte_perm (w[16], w[17], selector); + w[24] = __byte_perm (w[15], w[16], selector); + w[23] = __byte_perm (w[14], w[15], selector); + w[22] = __byte_perm 
(w[13], w[14], selector); + w[21] = __byte_perm (w[12], w[13], selector); + w[20] = __byte_perm (w[11], w[12], selector); + w[19] = __byte_perm (w[10], w[11], selector); + w[18] = __byte_perm (w[ 9], w[10], selector); + w[17] = __byte_perm (w[ 8], w[ 9], selector); + w[16] = __byte_perm (w[ 7], w[ 8], selector); + w[15] = __byte_perm (w[ 6], w[ 7], selector); + w[14] = __byte_perm (w[ 5], w[ 6], selector); + w[13] = __byte_perm (w[ 4], w[ 5], selector); + w[12] = __byte_perm (w[ 3], w[ 4], selector); + w[11] = __byte_perm (w[ 2], w[ 3], selector); + w[10] = __byte_perm (w[ 1], w[ 2], selector); + w[ 9] = __byte_perm (w[ 0], w[ 1], selector); + w[ 8] = __byte_perm ( 0, w[ 0], selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm (w[53], w[54], selector); + w[62] = __byte_perm (w[52], w[53], selector); + w[61] = __byte_perm (w[51], w[52], selector); + w[60] = __byte_perm (w[50], w[51], selector); + w[59] = __byte_perm (w[49], w[50], selector); + w[58] = __byte_perm (w[48], w[49], selector); + w[57] = __byte_perm (w[47], w[48], selector); + w[56] = __byte_perm (w[46], w[47], selector); + w[55] = __byte_perm (w[45], w[46], selector); + w[54] = __byte_perm (w[44], w[45], selector); + w[53] = __byte_perm (w[43], w[44], selector); + w[52] = __byte_perm (w[42], w[43], selector); + w[51] = __byte_perm (w[41], w[42], selector); + w[50] = __byte_perm (w[40], w[41], selector); + w[49] = __byte_perm (w[39], w[40], selector); + w[48] = __byte_perm (w[38], w[39], selector); + w[47] = __byte_perm (w[37], w[38], selector); + w[46] = __byte_perm (w[36], w[37], selector); + w[45] = __byte_perm (w[35], w[36], selector); + w[44] = __byte_perm (w[34], w[35], selector); + w[43] = __byte_perm (w[33], w[34], selector); + w[42] = __byte_perm (w[32], w[33], selector); + w[41] = __byte_perm (w[31], w[32], selector); + w[40] = __byte_perm (w[30], w[31], selector); + w[39] = __byte_perm 
(w[29], w[30], selector); + w[38] = __byte_perm (w[28], w[29], selector); + w[37] = __byte_perm (w[27], w[28], selector); + w[36] = __byte_perm (w[26], w[27], selector); + w[35] = __byte_perm (w[25], w[26], selector); + w[34] = __byte_perm (w[24], w[25], selector); + w[33] = __byte_perm (w[23], w[24], selector); + w[32] = __byte_perm (w[22], w[23], selector); + w[31] = __byte_perm (w[21], w[22], selector); + w[30] = __byte_perm (w[20], w[21], selector); + w[29] = __byte_perm (w[19], w[20], selector); + w[28] = __byte_perm (w[18], w[19], selector); + w[27] = __byte_perm (w[17], w[18], selector); + w[26] = __byte_perm (w[16], w[17], selector); + w[25] = __byte_perm (w[15], w[16], selector); + w[24] = __byte_perm (w[14], w[15], selector); + w[23] = __byte_perm (w[13], w[14], selector); + w[22] = __byte_perm (w[12], w[13], selector); + w[21] = __byte_perm (w[11], w[12], selector); + w[20] = __byte_perm (w[10], w[11], selector); + w[19] = __byte_perm (w[ 9], w[10], selector); + w[18] = __byte_perm (w[ 8], w[ 9], selector); + w[17] = __byte_perm (w[ 7], w[ 8], selector); + w[16] = __byte_perm (w[ 6], w[ 7], selector); + w[15] = __byte_perm (w[ 5], w[ 6], selector); + w[14] = __byte_perm (w[ 4], w[ 5], selector); + w[13] = __byte_perm (w[ 3], w[ 4], selector); + w[12] = __byte_perm (w[ 2], w[ 3], selector); + w[11] = __byte_perm (w[ 1], w[ 2], selector); + w[10] = __byte_perm (w[ 0], w[ 1], selector); + w[ 9] = __byte_perm ( 0, w[ 0], selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm (w[52], w[53], selector); + w[62] = __byte_perm (w[51], w[52], selector); + w[61] = __byte_perm (w[50], w[51], selector); + w[60] = __byte_perm (w[49], w[50], selector); + w[59] = __byte_perm (w[48], w[49], selector); + w[58] = __byte_perm (w[47], w[48], selector); + w[57] = __byte_perm (w[46], w[47], selector); + w[56] = __byte_perm (w[45], w[46], selector); + w[55] = 
__byte_perm (w[44], w[45], selector); + w[54] = __byte_perm (w[43], w[44], selector); + w[53] = __byte_perm (w[42], w[43], selector); + w[52] = __byte_perm (w[41], w[42], selector); + w[51] = __byte_perm (w[40], w[41], selector); + w[50] = __byte_perm (w[39], w[40], selector); + w[49] = __byte_perm (w[38], w[39], selector); + w[48] = __byte_perm (w[37], w[38], selector); + w[47] = __byte_perm (w[36], w[37], selector); + w[46] = __byte_perm (w[35], w[36], selector); + w[45] = __byte_perm (w[34], w[35], selector); + w[44] = __byte_perm (w[33], w[34], selector); + w[43] = __byte_perm (w[32], w[33], selector); + w[42] = __byte_perm (w[31], w[32], selector); + w[41] = __byte_perm (w[30], w[31], selector); + w[40] = __byte_perm (w[29], w[30], selector); + w[39] = __byte_perm (w[28], w[29], selector); + w[38] = __byte_perm (w[27], w[28], selector); + w[37] = __byte_perm (w[26], w[27], selector); + w[36] = __byte_perm (w[25], w[26], selector); + w[35] = __byte_perm (w[24], w[25], selector); + w[34] = __byte_perm (w[23], w[24], selector); + w[33] = __byte_perm (w[22], w[23], selector); + w[32] = __byte_perm (w[21], w[22], selector); + w[31] = __byte_perm (w[20], w[21], selector); + w[30] = __byte_perm (w[19], w[20], selector); + w[29] = __byte_perm (w[18], w[19], selector); + w[28] = __byte_perm (w[17], w[18], selector); + w[27] = __byte_perm (w[16], w[17], selector); + w[26] = __byte_perm (w[15], w[16], selector); + w[25] = __byte_perm (w[14], w[15], selector); + w[24] = __byte_perm (w[13], w[14], selector); + w[23] = __byte_perm (w[12], w[13], selector); + w[22] = __byte_perm (w[11], w[12], selector); + w[21] = __byte_perm (w[10], w[11], selector); + w[20] = __byte_perm (w[ 9], w[10], selector); + w[19] = __byte_perm (w[ 8], w[ 9], selector); + w[18] = __byte_perm (w[ 7], w[ 8], selector); + w[17] = __byte_perm (w[ 6], w[ 7], selector); + w[16] = __byte_perm (w[ 5], w[ 6], selector); + w[15] = __byte_perm (w[ 4], w[ 5], selector); + w[14] = __byte_perm (w[ 3], w[ 4], 
selector); + w[13] = __byte_perm (w[ 2], w[ 3], selector); + w[12] = __byte_perm (w[ 1], w[ 2], selector); + w[11] = __byte_perm (w[ 0], w[ 1], selector); + w[10] = __byte_perm ( 0, w[ 0], selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm (w[51], w[52], selector); + w[62] = __byte_perm (w[50], w[51], selector); + w[61] = __byte_perm (w[49], w[50], selector); + w[60] = __byte_perm (w[48], w[49], selector); + w[59] = __byte_perm (w[47], w[48], selector); + w[58] = __byte_perm (w[46], w[47], selector); + w[57] = __byte_perm (w[45], w[46], selector); + w[56] = __byte_perm (w[44], w[45], selector); + w[55] = __byte_perm (w[43], w[44], selector); + w[54] = __byte_perm (w[42], w[43], selector); + w[53] = __byte_perm (w[41], w[42], selector); + w[52] = __byte_perm (w[40], w[41], selector); + w[51] = __byte_perm (w[39], w[40], selector); + w[50] = __byte_perm (w[38], w[39], selector); + w[49] = __byte_perm (w[37], w[38], selector); + w[48] = __byte_perm (w[36], w[37], selector); + w[47] = __byte_perm (w[35], w[36], selector); + w[46] = __byte_perm (w[34], w[35], selector); + w[45] = __byte_perm (w[33], w[34], selector); + w[44] = __byte_perm (w[32], w[33], selector); + w[43] = __byte_perm (w[31], w[32], selector); + w[42] = __byte_perm (w[30], w[31], selector); + w[41] = __byte_perm (w[29], w[30], selector); + w[40] = __byte_perm (w[28], w[29], selector); + w[39] = __byte_perm (w[27], w[28], selector); + w[38] = __byte_perm (w[26], w[27], selector); + w[37] = __byte_perm (w[25], w[26], selector); + w[36] = __byte_perm (w[24], w[25], selector); + w[35] = __byte_perm (w[23], w[24], selector); + w[34] = __byte_perm (w[22], w[23], selector); + w[33] = __byte_perm (w[21], w[22], selector); + w[32] = __byte_perm (w[20], w[21], selector); + w[31] = __byte_perm (w[19], w[20], selector); + w[30] = __byte_perm (w[18], w[19], selector); + w[29] = 
__byte_perm (w[17], w[18], selector); + w[28] = __byte_perm (w[16], w[17], selector); + w[27] = __byte_perm (w[15], w[16], selector); + w[26] = __byte_perm (w[14], w[15], selector); + w[25] = __byte_perm (w[13], w[14], selector); + w[24] = __byte_perm (w[12], w[13], selector); + w[23] = __byte_perm (w[11], w[12], selector); + w[22] = __byte_perm (w[10], w[11], selector); + w[21] = __byte_perm (w[ 9], w[10], selector); + w[20] = __byte_perm (w[ 8], w[ 9], selector); + w[19] = __byte_perm (w[ 7], w[ 8], selector); + w[18] = __byte_perm (w[ 6], w[ 7], selector); + w[17] = __byte_perm (w[ 5], w[ 6], selector); + w[16] = __byte_perm (w[ 4], w[ 5], selector); + w[15] = __byte_perm (w[ 3], w[ 4], selector); + w[14] = __byte_perm (w[ 2], w[ 3], selector); + w[13] = __byte_perm (w[ 1], w[ 2], selector); + w[12] = __byte_perm (w[ 0], w[ 1], selector); + w[11] = __byte_perm ( 0, w[ 0], selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm (w[50], w[51], selector); + w[62] = __byte_perm (w[49], w[50], selector); + w[61] = __byte_perm (w[48], w[49], selector); + w[60] = __byte_perm (w[47], w[48], selector); + w[59] = __byte_perm (w[46], w[47], selector); + w[58] = __byte_perm (w[45], w[46], selector); + w[57] = __byte_perm (w[44], w[45], selector); + w[56] = __byte_perm (w[43], w[44], selector); + w[55] = __byte_perm (w[42], w[43], selector); + w[54] = __byte_perm (w[41], w[42], selector); + w[53] = __byte_perm (w[40], w[41], selector); + w[52] = __byte_perm (w[39], w[40], selector); + w[51] = __byte_perm (w[38], w[39], selector); + w[50] = __byte_perm (w[37], w[38], selector); + w[49] = __byte_perm (w[36], w[37], selector); + w[48] = __byte_perm (w[35], w[36], selector); + w[47] = __byte_perm (w[34], w[35], selector); + w[46] = __byte_perm (w[33], w[34], selector); + w[45] = __byte_perm (w[32], w[33], selector); + w[44] = __byte_perm 
(w[31], w[32], selector); + w[43] = __byte_perm (w[30], w[31], selector); + w[42] = __byte_perm (w[29], w[30], selector); + w[41] = __byte_perm (w[28], w[29], selector); + w[40] = __byte_perm (w[27], w[28], selector); + w[39] = __byte_perm (w[26], w[27], selector); + w[38] = __byte_perm (w[25], w[26], selector); + w[37] = __byte_perm (w[24], w[25], selector); + w[36] = __byte_perm (w[23], w[24], selector); + w[35] = __byte_perm (w[22], w[23], selector); + w[34] = __byte_perm (w[21], w[22], selector); + w[33] = __byte_perm (w[20], w[21], selector); + w[32] = __byte_perm (w[19], w[20], selector); + w[31] = __byte_perm (w[18], w[19], selector); + w[30] = __byte_perm (w[17], w[18], selector); + w[29] = __byte_perm (w[16], w[17], selector); + w[28] = __byte_perm (w[15], w[16], selector); + w[27] = __byte_perm (w[14], w[15], selector); + w[26] = __byte_perm (w[13], w[14], selector); + w[25] = __byte_perm (w[12], w[13], selector); + w[24] = __byte_perm (w[11], w[12], selector); + w[23] = __byte_perm (w[10], w[11], selector); + w[22] = __byte_perm (w[ 9], w[10], selector); + w[21] = __byte_perm (w[ 8], w[ 9], selector); + w[20] = __byte_perm (w[ 7], w[ 8], selector); + w[19] = __byte_perm (w[ 6], w[ 7], selector); + w[18] = __byte_perm (w[ 5], w[ 6], selector); + w[17] = __byte_perm (w[ 4], w[ 5], selector); + w[16] = __byte_perm (w[ 3], w[ 4], selector); + w[15] = __byte_perm (w[ 2], w[ 3], selector); + w[14] = __byte_perm (w[ 1], w[ 2], selector); + w[13] = __byte_perm (w[ 0], w[ 1], selector); + w[12] = __byte_perm ( 0, w[ 0], selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = __byte_perm (w[49], w[50], selector); + w[62] = __byte_perm (w[48], w[49], selector); + w[61] = __byte_perm (w[47], w[48], selector); + w[60] = __byte_perm (w[46], w[47], selector); + w[59] = __byte_perm (w[45], w[46], selector); + w[58] = 
__byte_perm (w[44], w[45], selector); + w[57] = __byte_perm (w[43], w[44], selector); + w[56] = __byte_perm (w[42], w[43], selector); + w[55] = __byte_perm (w[41], w[42], selector); + w[54] = __byte_perm (w[40], w[41], selector); + w[53] = __byte_perm (w[39], w[40], selector); + w[52] = __byte_perm (w[38], w[39], selector); + w[51] = __byte_perm (w[37], w[38], selector); + w[50] = __byte_perm (w[36], w[37], selector); + w[49] = __byte_perm (w[35], w[36], selector); + w[48] = __byte_perm (w[34], w[35], selector); + w[47] = __byte_perm (w[33], w[34], selector); + w[46] = __byte_perm (w[32], w[33], selector); + w[45] = __byte_perm (w[31], w[32], selector); + w[44] = __byte_perm (w[30], w[31], selector); + w[43] = __byte_perm (w[29], w[30], selector); + w[42] = __byte_perm (w[28], w[29], selector); + w[41] = __byte_perm (w[27], w[28], selector); + w[40] = __byte_perm (w[26], w[27], selector); + w[39] = __byte_perm (w[25], w[26], selector); + w[38] = __byte_perm (w[24], w[25], selector); + w[37] = __byte_perm (w[23], w[24], selector); + w[36] = __byte_perm (w[22], w[23], selector); + w[35] = __byte_perm (w[21], w[22], selector); + w[34] = __byte_perm (w[20], w[21], selector); + w[33] = __byte_perm (w[19], w[20], selector); + w[32] = __byte_perm (w[18], w[19], selector); + w[31] = __byte_perm (w[17], w[18], selector); + w[30] = __byte_perm (w[16], w[17], selector); + w[29] = __byte_perm (w[15], w[16], selector); + w[28] = __byte_perm (w[14], w[15], selector); + w[27] = __byte_perm (w[13], w[14], selector); + w[26] = __byte_perm (w[12], w[13], selector); + w[25] = __byte_perm (w[11], w[12], selector); + w[24] = __byte_perm (w[10], w[11], selector); + w[23] = __byte_perm (w[ 9], w[10], selector); + w[22] = __byte_perm (w[ 8], w[ 9], selector); + w[21] = __byte_perm (w[ 7], w[ 8], selector); + w[20] = __byte_perm (w[ 6], w[ 7], selector); + w[19] = __byte_perm (w[ 5], w[ 6], selector); + w[18] = __byte_perm (w[ 4], w[ 5], selector); + w[17] = __byte_perm (w[ 3], w[ 4], 
selector); + w[16] = __byte_perm (w[ 2], w[ 3], selector); + w[15] = __byte_perm (w[ 1], w[ 2], selector); + w[14] = __byte_perm (w[ 0], w[ 1], selector); + w[13] = __byte_perm ( 0, w[ 0], selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm (w[48], w[49], selector); + w[62] = __byte_perm (w[47], w[48], selector); + w[61] = __byte_perm (w[46], w[47], selector); + w[60] = __byte_perm (w[45], w[46], selector); + w[59] = __byte_perm (w[44], w[45], selector); + w[58] = __byte_perm (w[43], w[44], selector); + w[57] = __byte_perm (w[42], w[43], selector); + w[56] = __byte_perm (w[41], w[42], selector); + w[55] = __byte_perm (w[40], w[41], selector); + w[54] = __byte_perm (w[39], w[40], selector); + w[53] = __byte_perm (w[38], w[39], selector); + w[52] = __byte_perm (w[37], w[38], selector); + w[51] = __byte_perm (w[36], w[37], selector); + w[50] = __byte_perm (w[35], w[36], selector); + w[49] = __byte_perm (w[34], w[35], selector); + w[48] = __byte_perm (w[33], w[34], selector); + w[47] = __byte_perm (w[32], w[33], selector); + w[46] = __byte_perm (w[31], w[32], selector); + w[45] = __byte_perm (w[30], w[31], selector); + w[44] = __byte_perm (w[29], w[30], selector); + w[43] = __byte_perm (w[28], w[29], selector); + w[42] = __byte_perm (w[27], w[28], selector); + w[41] = __byte_perm (w[26], w[27], selector); + w[40] = __byte_perm (w[25], w[26], selector); + w[39] = __byte_perm (w[24], w[25], selector); + w[38] = __byte_perm (w[23], w[24], selector); + w[37] = __byte_perm (w[22], w[23], selector); + w[36] = __byte_perm (w[21], w[22], selector); + w[35] = __byte_perm (w[20], w[21], selector); + w[34] = __byte_perm (w[19], w[20], selector); + w[33] = __byte_perm (w[18], w[19], selector); + w[32] = __byte_perm (w[17], w[18], selector); + w[31] = __byte_perm (w[16], w[17], selector); + w[30] = __byte_perm 
(w[15], w[16], selector); + w[29] = __byte_perm (w[14], w[15], selector); + w[28] = __byte_perm (w[13], w[14], selector); + w[27] = __byte_perm (w[12], w[13], selector); + w[26] = __byte_perm (w[11], w[12], selector); + w[25] = __byte_perm (w[10], w[11], selector); + w[24] = __byte_perm (w[ 9], w[10], selector); + w[23] = __byte_perm (w[ 8], w[ 9], selector); + w[22] = __byte_perm (w[ 7], w[ 8], selector); + w[21] = __byte_perm (w[ 6], w[ 7], selector); + w[20] = __byte_perm (w[ 5], w[ 6], selector); + w[19] = __byte_perm (w[ 4], w[ 5], selector); + w[18] = __byte_perm (w[ 3], w[ 4], selector); + w[17] = __byte_perm (w[ 2], w[ 3], selector); + w[16] = __byte_perm (w[ 1], w[ 2], selector); + w[15] = __byte_perm (w[ 0], w[ 1], selector); + w[14] = __byte_perm ( 0, w[ 0], selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm (w[47], w[48], selector); + w[62] = __byte_perm (w[46], w[47], selector); + w[61] = __byte_perm (w[45], w[46], selector); + w[60] = __byte_perm (w[44], w[45], selector); + w[59] = __byte_perm (w[43], w[44], selector); + w[58] = __byte_perm (w[42], w[43], selector); + w[57] = __byte_perm (w[41], w[42], selector); + w[56] = __byte_perm (w[40], w[41], selector); + w[55] = __byte_perm (w[39], w[40], selector); + w[54] = __byte_perm (w[38], w[39], selector); + w[53] = __byte_perm (w[37], w[38], selector); + w[52] = __byte_perm (w[36], w[37], selector); + w[51] = __byte_perm (w[35], w[36], selector); + w[50] = __byte_perm (w[34], w[35], selector); + w[49] = __byte_perm (w[33], w[34], selector); + w[48] = __byte_perm (w[32], w[33], selector); + w[47] = __byte_perm (w[31], w[32], selector); + w[46] = __byte_perm (w[30], w[31], selector); + w[45] = __byte_perm (w[29], w[30], selector); + w[44] = __byte_perm (w[28], w[29], selector); + w[43] = __byte_perm (w[27], w[28], 
selector); + w[42] = __byte_perm (w[26], w[27], selector); + w[41] = __byte_perm (w[25], w[26], selector); + w[40] = __byte_perm (w[24], w[25], selector); + w[39] = __byte_perm (w[23], w[24], selector); + w[38] = __byte_perm (w[22], w[23], selector); + w[37] = __byte_perm (w[21], w[22], selector); + w[36] = __byte_perm (w[20], w[21], selector); + w[35] = __byte_perm (w[19], w[20], selector); + w[34] = __byte_perm (w[18], w[19], selector); + w[33] = __byte_perm (w[17], w[18], selector); + w[32] = __byte_perm (w[16], w[17], selector); + w[31] = __byte_perm (w[15], w[16], selector); + w[30] = __byte_perm (w[14], w[15], selector); + w[29] = __byte_perm (w[13], w[14], selector); + w[28] = __byte_perm (w[12], w[13], selector); + w[27] = __byte_perm (w[11], w[12], selector); + w[26] = __byte_perm (w[10], w[11], selector); + w[25] = __byte_perm (w[ 9], w[10], selector); + w[24] = __byte_perm (w[ 8], w[ 9], selector); + w[23] = __byte_perm (w[ 7], w[ 8], selector); + w[22] = __byte_perm (w[ 6], w[ 7], selector); + w[21] = __byte_perm (w[ 5], w[ 6], selector); + w[20] = __byte_perm (w[ 4], w[ 5], selector); + w[19] = __byte_perm (w[ 3], w[ 4], selector); + w[18] = __byte_perm (w[ 2], w[ 3], selector); + w[17] = __byte_perm (w[ 1], w[ 2], selector); + w[16] = __byte_perm (w[ 0], w[ 1], selector); + w[15] = __byte_perm ( 0, w[ 0], selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm (w[46], w[47], selector); + w[62] = __byte_perm (w[45], w[46], selector); + w[61] = __byte_perm (w[44], w[45], selector); + w[60] = __byte_perm (w[43], w[44], selector); + w[59] = __byte_perm (w[42], w[43], selector); + w[58] = __byte_perm (w[41], w[42], selector); + w[57] = __byte_perm (w[40], w[41], selector); + w[56] = __byte_perm (w[39], w[40], selector); + w[55] = __byte_perm (w[38], w[39], 
selector); + w[54] = __byte_perm (w[37], w[38], selector); + w[53] = __byte_perm (w[36], w[37], selector); + w[52] = __byte_perm (w[35], w[36], selector); + w[51] = __byte_perm (w[34], w[35], selector); + w[50] = __byte_perm (w[33], w[34], selector); + w[49] = __byte_perm (w[32], w[33], selector); + w[48] = __byte_perm (w[31], w[32], selector); + w[47] = __byte_perm (w[30], w[31], selector); + w[46] = __byte_perm (w[29], w[30], selector); + w[45] = __byte_perm (w[28], w[29], selector); + w[44] = __byte_perm (w[27], w[28], selector); + w[43] = __byte_perm (w[26], w[27], selector); + w[42] = __byte_perm (w[25], w[26], selector); + w[41] = __byte_perm (w[24], w[25], selector); + w[40] = __byte_perm (w[23], w[24], selector); + w[39] = __byte_perm (w[22], w[23], selector); + w[38] = __byte_perm (w[21], w[22], selector); + w[37] = __byte_perm (w[20], w[21], selector); + w[36] = __byte_perm (w[19], w[20], selector); + w[35] = __byte_perm (w[18], w[19], selector); + w[34] = __byte_perm (w[17], w[18], selector); + w[33] = __byte_perm (w[16], w[17], selector); + w[32] = __byte_perm (w[15], w[16], selector); + w[31] = __byte_perm (w[14], w[15], selector); + w[30] = __byte_perm (w[13], w[14], selector); + w[29] = __byte_perm (w[12], w[13], selector); + w[28] = __byte_perm (w[11], w[12], selector); + w[27] = __byte_perm (w[10], w[11], selector); + w[26] = __byte_perm (w[ 9], w[10], selector); + w[25] = __byte_perm (w[ 8], w[ 9], selector); + w[24] = __byte_perm (w[ 7], w[ 8], selector); + w[23] = __byte_perm (w[ 6], w[ 7], selector); + w[22] = __byte_perm (w[ 5], w[ 6], selector); + w[21] = __byte_perm (w[ 4], w[ 5], selector); + w[20] = __byte_perm (w[ 3], w[ 4], selector); + w[19] = __byte_perm (w[ 2], w[ 3], selector); + w[18] = __byte_perm (w[ 1], w[ 2], selector); + w[17] = __byte_perm (w[ 0], w[ 1], selector); + w[16] = __byte_perm ( 0, w[ 0], selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + 
w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm (w[45], w[46], selector); + w[62] = __byte_perm (w[44], w[45], selector); + w[61] = __byte_perm (w[43], w[44], selector); + w[60] = __byte_perm (w[42], w[43], selector); + w[59] = __byte_perm (w[41], w[42], selector); + w[58] = __byte_perm (w[40], w[41], selector); + w[57] = __byte_perm (w[39], w[40], selector); + w[56] = __byte_perm (w[38], w[39], selector); + w[55] = __byte_perm (w[37], w[38], selector); + w[54] = __byte_perm (w[36], w[37], selector); + w[53] = __byte_perm (w[35], w[36], selector); + w[52] = __byte_perm (w[34], w[35], selector); + w[51] = __byte_perm (w[33], w[34], selector); + w[50] = __byte_perm (w[32], w[33], selector); + w[49] = __byte_perm (w[31], w[32], selector); + w[48] = __byte_perm (w[30], w[31], selector); + w[47] = __byte_perm (w[29], w[30], selector); + w[46] = __byte_perm (w[28], w[29], selector); + w[45] = __byte_perm (w[27], w[28], selector); + w[44] = __byte_perm (w[26], w[27], selector); + w[43] = __byte_perm (w[25], w[26], selector); + w[42] = __byte_perm (w[24], w[25], selector); + w[41] = __byte_perm (w[23], w[24], selector); + w[40] = __byte_perm (w[22], w[23], selector); + w[39] = __byte_perm (w[21], w[22], selector); + w[38] = __byte_perm (w[20], w[21], selector); + w[37] = __byte_perm (w[19], w[20], selector); + w[36] = __byte_perm (w[18], w[19], selector); + w[35] = __byte_perm (w[17], w[18], selector); + w[34] = __byte_perm (w[16], w[17], selector); + w[33] = __byte_perm (w[15], w[16], selector); + w[32] = __byte_perm (w[14], w[15], selector); + w[31] = __byte_perm (w[13], w[14], selector); + w[30] = __byte_perm (w[12], w[13], selector); + w[29] = __byte_perm (w[11], w[12], selector); + w[28] = __byte_perm (w[10], w[11], selector); + w[27] = __byte_perm (w[ 9], w[10], selector); + w[26] = __byte_perm (w[ 8], w[ 9], selector); + w[25] = __byte_perm (w[ 7], w[ 8], selector); + w[24] = 
__byte_perm (w[ 6], w[ 7], selector); + w[23] = __byte_perm (w[ 5], w[ 6], selector); + w[22] = __byte_perm (w[ 4], w[ 5], selector); + w[21] = __byte_perm (w[ 3], w[ 4], selector); + w[20] = __byte_perm (w[ 2], w[ 3], selector); + w[19] = __byte_perm (w[ 1], w[ 2], selector); + w[18] = __byte_perm (w[ 0], w[ 1], selector); + w[17] = __byte_perm ( 0, w[ 0], selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm (w[44], w[45], selector); + w[62] = __byte_perm (w[43], w[44], selector); + w[61] = __byte_perm (w[42], w[43], selector); + w[60] = __byte_perm (w[41], w[42], selector); + w[59] = __byte_perm (w[40], w[41], selector); + w[58] = __byte_perm (w[39], w[40], selector); + w[57] = __byte_perm (w[38], w[39], selector); + w[56] = __byte_perm (w[37], w[38], selector); + w[55] = __byte_perm (w[36], w[37], selector); + w[54] = __byte_perm (w[35], w[36], selector); + w[53] = __byte_perm (w[34], w[35], selector); + w[52] = __byte_perm (w[33], w[34], selector); + w[51] = __byte_perm (w[32], w[33], selector); + w[50] = __byte_perm (w[31], w[32], selector); + w[49] = __byte_perm (w[30], w[31], selector); + w[48] = __byte_perm (w[29], w[30], selector); + w[47] = __byte_perm (w[28], w[29], selector); + w[46] = __byte_perm (w[27], w[28], selector); + w[45] = __byte_perm (w[26], w[27], selector); + w[44] = __byte_perm (w[25], w[26], selector); + w[43] = __byte_perm (w[24], w[25], selector); + w[42] = __byte_perm (w[23], w[24], selector); + w[41] = __byte_perm (w[22], w[23], selector); + w[40] = __byte_perm (w[21], w[22], selector); + w[39] = __byte_perm (w[20], w[21], selector); + w[38] = __byte_perm (w[19], w[20], selector); + w[37] = __byte_perm (w[18], w[19], selector); + w[36] = __byte_perm (w[17], w[18], selector); + w[35] = __byte_perm (w[16], w[17], 
selector); + w[34] = __byte_perm (w[15], w[16], selector); + w[33] = __byte_perm (w[14], w[15], selector); + w[32] = __byte_perm (w[13], w[14], selector); + w[31] = __byte_perm (w[12], w[13], selector); + w[30] = __byte_perm (w[11], w[12], selector); + w[29] = __byte_perm (w[10], w[11], selector); + w[28] = __byte_perm (w[ 9], w[10], selector); + w[27] = __byte_perm (w[ 8], w[ 9], selector); + w[26] = __byte_perm (w[ 7], w[ 8], selector); + w[25] = __byte_perm (w[ 6], w[ 7], selector); + w[24] = __byte_perm (w[ 5], w[ 6], selector); + w[23] = __byte_perm (w[ 4], w[ 5], selector); + w[22] = __byte_perm (w[ 3], w[ 4], selector); + w[21] = __byte_perm (w[ 2], w[ 3], selector); + w[20] = __byte_perm (w[ 1], w[ 2], selector); + w[19] = __byte_perm (w[ 0], w[ 1], selector); + w[18] = __byte_perm ( 0, w[ 0], selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm (w[43], w[44], selector); + w[62] = __byte_perm (w[42], w[43], selector); + w[61] = __byte_perm (w[41], w[42], selector); + w[60] = __byte_perm (w[40], w[41], selector); + w[59] = __byte_perm (w[39], w[40], selector); + w[58] = __byte_perm (w[38], w[39], selector); + w[57] = __byte_perm (w[37], w[38], selector); + w[56] = __byte_perm (w[36], w[37], selector); + w[55] = __byte_perm (w[35], w[36], selector); + w[54] = __byte_perm (w[34], w[35], selector); + w[53] = __byte_perm (w[33], w[34], selector); + w[52] = __byte_perm (w[32], w[33], selector); + w[51] = __byte_perm (w[31], w[32], selector); + w[50] = __byte_perm (w[30], w[31], selector); + w[49] = __byte_perm (w[29], w[30], selector); + w[48] = __byte_perm (w[28], w[29], selector); + w[47] = __byte_perm (w[27], w[28], selector); + w[46] = __byte_perm (w[26], w[27], selector); + w[45] = __byte_perm (w[25], w[26], selector); + 
w[44] = __byte_perm (w[24], w[25], selector); + w[43] = __byte_perm (w[23], w[24], selector); + w[42] = __byte_perm (w[22], w[23], selector); + w[41] = __byte_perm (w[21], w[22], selector); + w[40] = __byte_perm (w[20], w[21], selector); + w[39] = __byte_perm (w[19], w[20], selector); + w[38] = __byte_perm (w[18], w[19], selector); + w[37] = __byte_perm (w[17], w[18], selector); + w[36] = __byte_perm (w[16], w[17], selector); + w[35] = __byte_perm (w[15], w[16], selector); + w[34] = __byte_perm (w[14], w[15], selector); + w[33] = __byte_perm (w[13], w[14], selector); + w[32] = __byte_perm (w[12], w[13], selector); + w[31] = __byte_perm (w[11], w[12], selector); + w[30] = __byte_perm (w[10], w[11], selector); + w[29] = __byte_perm (w[ 9], w[10], selector); + w[28] = __byte_perm (w[ 8], w[ 9], selector); + w[27] = __byte_perm (w[ 7], w[ 8], selector); + w[26] = __byte_perm (w[ 6], w[ 7], selector); + w[25] = __byte_perm (w[ 5], w[ 6], selector); + w[24] = __byte_perm (w[ 4], w[ 5], selector); + w[23] = __byte_perm (w[ 3], w[ 4], selector); + w[22] = __byte_perm (w[ 2], w[ 3], selector); + w[21] = __byte_perm (w[ 1], w[ 2], selector); + w[20] = __byte_perm (w[ 0], w[ 1], selector); + w[19] = __byte_perm ( 0, w[ 0], selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm (w[42], w[43], selector); + w[62] = __byte_perm (w[41], w[42], selector); + w[61] = __byte_perm (w[40], w[41], selector); + w[60] = __byte_perm (w[39], w[40], selector); + w[59] = __byte_perm (w[38], w[39], selector); + w[58] = __byte_perm (w[37], w[38], selector); + w[57] = __byte_perm (w[36], w[37], selector); + w[56] = __byte_perm (w[35], w[36], selector); + w[55] = __byte_perm (w[34], w[35], selector); + w[54] = __byte_perm (w[33], w[34], selector); + 
w[53] = __byte_perm (w[32], w[33], selector); + w[52] = __byte_perm (w[31], w[32], selector); + w[51] = __byte_perm (w[30], w[31], selector); + w[50] = __byte_perm (w[29], w[30], selector); + w[49] = __byte_perm (w[28], w[29], selector); + w[48] = __byte_perm (w[27], w[28], selector); + w[47] = __byte_perm (w[26], w[27], selector); + w[46] = __byte_perm (w[25], w[26], selector); + w[45] = __byte_perm (w[24], w[25], selector); + w[44] = __byte_perm (w[23], w[24], selector); + w[43] = __byte_perm (w[22], w[23], selector); + w[42] = __byte_perm (w[21], w[22], selector); + w[41] = __byte_perm (w[20], w[21], selector); + w[40] = __byte_perm (w[19], w[20], selector); + w[39] = __byte_perm (w[18], w[19], selector); + w[38] = __byte_perm (w[17], w[18], selector); + w[37] = __byte_perm (w[16], w[17], selector); + w[36] = __byte_perm (w[15], w[16], selector); + w[35] = __byte_perm (w[14], w[15], selector); + w[34] = __byte_perm (w[13], w[14], selector); + w[33] = __byte_perm (w[12], w[13], selector); + w[32] = __byte_perm (w[11], w[12], selector); + w[31] = __byte_perm (w[10], w[11], selector); + w[30] = __byte_perm (w[ 9], w[10], selector); + w[29] = __byte_perm (w[ 8], w[ 9], selector); + w[28] = __byte_perm (w[ 7], w[ 8], selector); + w[27] = __byte_perm (w[ 6], w[ 7], selector); + w[26] = __byte_perm (w[ 5], w[ 6], selector); + w[25] = __byte_perm (w[ 4], w[ 5], selector); + w[24] = __byte_perm (w[ 3], w[ 4], selector); + w[23] = __byte_perm (w[ 2], w[ 3], selector); + w[22] = __byte_perm (w[ 1], w[ 2], selector); + w[21] = __byte_perm (w[ 0], w[ 1], selector); + w[20] = __byte_perm ( 0, w[ 0], selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm (w[41], w[42], selector); + w[62] = __byte_perm (w[40], w[41], 
selector); + w[61] = __byte_perm (w[39], w[40], selector); + w[60] = __byte_perm (w[38], w[39], selector); + w[59] = __byte_perm (w[37], w[38], selector); + w[58] = __byte_perm (w[36], w[37], selector); + w[57] = __byte_perm (w[35], w[36], selector); + w[56] = __byte_perm (w[34], w[35], selector); + w[55] = __byte_perm (w[33], w[34], selector); + w[54] = __byte_perm (w[32], w[33], selector); + w[53] = __byte_perm (w[31], w[32], selector); + w[52] = __byte_perm (w[30], w[31], selector); + w[51] = __byte_perm (w[29], w[30], selector); + w[50] = __byte_perm (w[28], w[29], selector); + w[49] = __byte_perm (w[27], w[28], selector); + w[48] = __byte_perm (w[26], w[27], selector); + w[47] = __byte_perm (w[25], w[26], selector); + w[46] = __byte_perm (w[24], w[25], selector); + w[45] = __byte_perm (w[23], w[24], selector); + w[44] = __byte_perm (w[22], w[23], selector); + w[43] = __byte_perm (w[21], w[22], selector); + w[42] = __byte_perm (w[20], w[21], selector); + w[41] = __byte_perm (w[19], w[20], selector); + w[40] = __byte_perm (w[18], w[19], selector); + w[39] = __byte_perm (w[17], w[18], selector); + w[38] = __byte_perm (w[16], w[17], selector); + w[37] = __byte_perm (w[15], w[16], selector); + w[36] = __byte_perm (w[14], w[15], selector); + w[35] = __byte_perm (w[13], w[14], selector); + w[34] = __byte_perm (w[12], w[13], selector); + w[33] = __byte_perm (w[11], w[12], selector); + w[32] = __byte_perm (w[10], w[11], selector); + w[31] = __byte_perm (w[ 9], w[10], selector); + w[30] = __byte_perm (w[ 8], w[ 9], selector); + w[29] = __byte_perm (w[ 7], w[ 8], selector); + w[28] = __byte_perm (w[ 6], w[ 7], selector); + w[27] = __byte_perm (w[ 5], w[ 6], selector); + w[26] = __byte_perm (w[ 4], w[ 5], selector); + w[25] = __byte_perm (w[ 3], w[ 4], selector); + w[24] = __byte_perm (w[ 2], w[ 3], selector); + w[23] = __byte_perm (w[ 1], w[ 2], selector); + w[22] = __byte_perm (w[ 0], w[ 1], selector); + w[21] = __byte_perm ( 0, w[ 0], selector); + w[20] = 0; + w[19] = 
0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm (w[40], w[41], selector); + w[62] = __byte_perm (w[39], w[40], selector); + w[61] = __byte_perm (w[38], w[39], selector); + w[60] = __byte_perm (w[37], w[38], selector); + w[59] = __byte_perm (w[36], w[37], selector); + w[58] = __byte_perm (w[35], w[36], selector); + w[57] = __byte_perm (w[34], w[35], selector); + w[56] = __byte_perm (w[33], w[34], selector); + w[55] = __byte_perm (w[32], w[33], selector); + w[54] = __byte_perm (w[31], w[32], selector); + w[53] = __byte_perm (w[30], w[31], selector); + w[52] = __byte_perm (w[29], w[30], selector); + w[51] = __byte_perm (w[28], w[29], selector); + w[50] = __byte_perm (w[27], w[28], selector); + w[49] = __byte_perm (w[26], w[27], selector); + w[48] = __byte_perm (w[25], w[26], selector); + w[47] = __byte_perm (w[24], w[25], selector); + w[46] = __byte_perm (w[23], w[24], selector); + w[45] = __byte_perm (w[22], w[23], selector); + w[44] = __byte_perm (w[21], w[22], selector); + w[43] = __byte_perm (w[20], w[21], selector); + w[42] = __byte_perm (w[19], w[20], selector); + w[41] = __byte_perm (w[18], w[19], selector); + w[40] = __byte_perm (w[17], w[18], selector); + w[39] = __byte_perm (w[16], w[17], selector); + w[38] = __byte_perm (w[15], w[16], selector); + w[37] = __byte_perm (w[14], w[15], selector); + w[36] = __byte_perm (w[13], w[14], selector); + w[35] = __byte_perm (w[12], w[13], selector); + w[34] = __byte_perm (w[11], w[12], selector); + w[33] = __byte_perm (w[10], w[11], selector); + w[32] = __byte_perm (w[ 9], w[10], selector); + w[31] = __byte_perm (w[ 8], w[ 9], selector); + w[30] = __byte_perm (w[ 7], w[ 8], selector); + w[29] = __byte_perm (w[ 6], w[ 7], selector); + w[28] = __byte_perm (w[ 5], w[ 6], 
selector); + w[27] = __byte_perm (w[ 4], w[ 5], selector); + w[26] = __byte_perm (w[ 3], w[ 4], selector); + w[25] = __byte_perm (w[ 2], w[ 3], selector); + w[24] = __byte_perm (w[ 1], w[ 2], selector); + w[23] = __byte_perm (w[ 0], w[ 1], selector); + w[22] = __byte_perm ( 0, w[ 0], selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = __byte_perm (w[39], w[40], selector); + w[62] = __byte_perm (w[38], w[39], selector); + w[61] = __byte_perm (w[37], w[38], selector); + w[60] = __byte_perm (w[36], w[37], selector); + w[59] = __byte_perm (w[35], w[36], selector); + w[58] = __byte_perm (w[34], w[35], selector); + w[57] = __byte_perm (w[33], w[34], selector); + w[56] = __byte_perm (w[32], w[33], selector); + w[55] = __byte_perm (w[31], w[32], selector); + w[54] = __byte_perm (w[30], w[31], selector); + w[53] = __byte_perm (w[29], w[30], selector); + w[52] = __byte_perm (w[28], w[29], selector); + w[51] = __byte_perm (w[27], w[28], selector); + w[50] = __byte_perm (w[26], w[27], selector); + w[49] = __byte_perm (w[25], w[26], selector); + w[48] = __byte_perm (w[24], w[25], selector); + w[47] = __byte_perm (w[23], w[24], selector); + w[46] = __byte_perm (w[22], w[23], selector); + w[45] = __byte_perm (w[21], w[22], selector); + w[44] = __byte_perm (w[20], w[21], selector); + w[43] = __byte_perm (w[19], w[20], selector); + w[42] = __byte_perm (w[18], w[19], selector); + w[41] = __byte_perm (w[17], w[18], selector); + w[40] = __byte_perm (w[16], w[17], selector); + w[39] = __byte_perm (w[15], w[16], selector); + w[38] = __byte_perm (w[14], w[15], selector); + w[37] = __byte_perm (w[13], w[14], selector); + w[36] = __byte_perm (w[12], w[13], selector); + w[35] = __byte_perm (w[11], w[12], selector); + 
w[34] = __byte_perm (w[10], w[11], selector); + w[33] = __byte_perm (w[ 9], w[10], selector); + w[32] = __byte_perm (w[ 8], w[ 9], selector); + w[31] = __byte_perm (w[ 7], w[ 8], selector); + w[30] = __byte_perm (w[ 6], w[ 7], selector); + w[29] = __byte_perm (w[ 5], w[ 6], selector); + w[28] = __byte_perm (w[ 4], w[ 5], selector); + w[27] = __byte_perm (w[ 3], w[ 4], selector); + w[26] = __byte_perm (w[ 2], w[ 3], selector); + w[25] = __byte_perm (w[ 1], w[ 2], selector); + w[24] = __byte_perm (w[ 0], w[ 1], selector); + w[23] = __byte_perm ( 0, w[ 0], selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm (w[38], w[39], selector); + w[62] = __byte_perm (w[37], w[38], selector); + w[61] = __byte_perm (w[36], w[37], selector); + w[60] = __byte_perm (w[35], w[36], selector); + w[59] = __byte_perm (w[34], w[35], selector); + w[58] = __byte_perm (w[33], w[34], selector); + w[57] = __byte_perm (w[32], w[33], selector); + w[56] = __byte_perm (w[31], w[32], selector); + w[55] = __byte_perm (w[30], w[31], selector); + w[54] = __byte_perm (w[29], w[30], selector); + w[53] = __byte_perm (w[28], w[29], selector); + w[52] = __byte_perm (w[27], w[28], selector); + w[51] = __byte_perm (w[26], w[27], selector); + w[50] = __byte_perm (w[25], w[26], selector); + w[49] = __byte_perm (w[24], w[25], selector); + w[48] = __byte_perm (w[23], w[24], selector); + w[47] = __byte_perm (w[22], w[23], selector); + w[46] = __byte_perm (w[21], w[22], selector); + w[45] = __byte_perm (w[20], w[21], selector); + w[44] = __byte_perm (w[19], w[20], selector); + w[43] = __byte_perm (w[18], w[19], selector); + w[42] = __byte_perm (w[17], w[18], selector); + w[41] = __byte_perm (w[16], w[17], selector); + 
w[40] = __byte_perm (w[15], w[16], selector); + w[39] = __byte_perm (w[14], w[15], selector); + w[38] = __byte_perm (w[13], w[14], selector); + w[37] = __byte_perm (w[12], w[13], selector); + w[36] = __byte_perm (w[11], w[12], selector); + w[35] = __byte_perm (w[10], w[11], selector); + w[34] = __byte_perm (w[ 9], w[10], selector); + w[33] = __byte_perm (w[ 8], w[ 9], selector); + w[32] = __byte_perm (w[ 7], w[ 8], selector); + w[31] = __byte_perm (w[ 6], w[ 7], selector); + w[30] = __byte_perm (w[ 5], w[ 6], selector); + w[29] = __byte_perm (w[ 4], w[ 5], selector); + w[28] = __byte_perm (w[ 3], w[ 4], selector); + w[27] = __byte_perm (w[ 2], w[ 3], selector); + w[26] = __byte_perm (w[ 1], w[ 2], selector); + w[25] = __byte_perm (w[ 0], w[ 1], selector); + w[24] = __byte_perm ( 0, w[ 0], selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm (w[37], w[38], selector); + w[62] = __byte_perm (w[36], w[37], selector); + w[61] = __byte_perm (w[35], w[36], selector); + w[60] = __byte_perm (w[34], w[35], selector); + w[59] = __byte_perm (w[33], w[34], selector); + w[58] = __byte_perm (w[32], w[33], selector); + w[57] = __byte_perm (w[31], w[32], selector); + w[56] = __byte_perm (w[30], w[31], selector); + w[55] = __byte_perm (w[29], w[30], selector); + w[54] = __byte_perm (w[28], w[29], selector); + w[53] = __byte_perm (w[27], w[28], selector); + w[52] = __byte_perm (w[26], w[27], selector); + w[51] = __byte_perm (w[25], w[26], selector); + w[50] = __byte_perm (w[24], w[25], selector); + w[49] = __byte_perm (w[23], w[24], selector); + w[48] = __byte_perm (w[22], w[23], selector); + w[47] = __byte_perm (w[21], w[22], selector); + w[46] = __byte_perm (w[20], w[21], 
selector); + w[45] = __byte_perm (w[19], w[20], selector); + w[44] = __byte_perm (w[18], w[19], selector); + w[43] = __byte_perm (w[17], w[18], selector); + w[42] = __byte_perm (w[16], w[17], selector); + w[41] = __byte_perm (w[15], w[16], selector); + w[40] = __byte_perm (w[14], w[15], selector); + w[39] = __byte_perm (w[13], w[14], selector); + w[38] = __byte_perm (w[12], w[13], selector); + w[37] = __byte_perm (w[11], w[12], selector); + w[36] = __byte_perm (w[10], w[11], selector); + w[35] = __byte_perm (w[ 9], w[10], selector); + w[34] = __byte_perm (w[ 8], w[ 9], selector); + w[33] = __byte_perm (w[ 7], w[ 8], selector); + w[32] = __byte_perm (w[ 6], w[ 7], selector); + w[31] = __byte_perm (w[ 5], w[ 6], selector); + w[30] = __byte_perm (w[ 4], w[ 5], selector); + w[29] = __byte_perm (w[ 3], w[ 4], selector); + w[28] = __byte_perm (w[ 2], w[ 3], selector); + w[27] = __byte_perm (w[ 1], w[ 2], selector); + w[26] = __byte_perm (w[ 0], w[ 1], selector); + w[25] = __byte_perm ( 0, w[ 0], selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm (w[36], w[37], selector); + w[62] = __byte_perm (w[35], w[36], selector); + w[61] = __byte_perm (w[34], w[35], selector); + w[60] = __byte_perm (w[33], w[34], selector); + w[59] = __byte_perm (w[32], w[33], selector); + w[58] = __byte_perm (w[31], w[32], selector); + w[57] = __byte_perm (w[30], w[31], selector); + w[56] = __byte_perm (w[29], w[30], selector); + w[55] = __byte_perm (w[28], w[29], selector); + w[54] = __byte_perm (w[27], w[28], selector); + w[53] = __byte_perm (w[26], w[27], selector); + w[52] = __byte_perm (w[25], w[26], selector); + w[51] = __byte_perm (w[24], w[25], selector); + w[50] = 
__byte_perm (w[23], w[24], selector); + w[49] = __byte_perm (w[22], w[23], selector); + w[48] = __byte_perm (w[21], w[22], selector); + w[47] = __byte_perm (w[20], w[21], selector); + w[46] = __byte_perm (w[19], w[20], selector); + w[45] = __byte_perm (w[18], w[19], selector); + w[44] = __byte_perm (w[17], w[18], selector); + w[43] = __byte_perm (w[16], w[17], selector); + w[42] = __byte_perm (w[15], w[16], selector); + w[41] = __byte_perm (w[14], w[15], selector); + w[40] = __byte_perm (w[13], w[14], selector); + w[39] = __byte_perm (w[12], w[13], selector); + w[38] = __byte_perm (w[11], w[12], selector); + w[37] = __byte_perm (w[10], w[11], selector); + w[36] = __byte_perm (w[ 9], w[10], selector); + w[35] = __byte_perm (w[ 8], w[ 9], selector); + w[34] = __byte_perm (w[ 7], w[ 8], selector); + w[33] = __byte_perm (w[ 6], w[ 7], selector); + w[32] = __byte_perm (w[ 5], w[ 6], selector); + w[31] = __byte_perm (w[ 4], w[ 5], selector); + w[30] = __byte_perm (w[ 3], w[ 4], selector); + w[29] = __byte_perm (w[ 2], w[ 3], selector); + w[28] = __byte_perm (w[ 1], w[ 2], selector); + w[27] = __byte_perm (w[ 0], w[ 1], selector); + w[26] = __byte_perm ( 0, w[ 0], selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm (w[35], w[36], selector); + w[62] = __byte_perm (w[34], w[35], selector); + w[61] = __byte_perm (w[33], w[34], selector); + w[60] = __byte_perm (w[32], w[33], selector); + w[59] = __byte_perm (w[31], w[32], selector); + w[58] = __byte_perm (w[30], w[31], selector); + w[57] = __byte_perm (w[29], w[30], selector); + w[56] = __byte_perm (w[28], w[29], selector); + w[55] = __byte_perm (w[27], w[28], selector); + w[54] = 
__byte_perm (w[26], w[27], selector); + w[53] = __byte_perm (w[25], w[26], selector); + w[52] = __byte_perm (w[24], w[25], selector); + w[51] = __byte_perm (w[23], w[24], selector); + w[50] = __byte_perm (w[22], w[23], selector); + w[49] = __byte_perm (w[21], w[22], selector); + w[48] = __byte_perm (w[20], w[21], selector); + w[47] = __byte_perm (w[19], w[20], selector); + w[46] = __byte_perm (w[18], w[19], selector); + w[45] = __byte_perm (w[17], w[18], selector); + w[44] = __byte_perm (w[16], w[17], selector); + w[43] = __byte_perm (w[15], w[16], selector); + w[42] = __byte_perm (w[14], w[15], selector); + w[41] = __byte_perm (w[13], w[14], selector); + w[40] = __byte_perm (w[12], w[13], selector); + w[39] = __byte_perm (w[11], w[12], selector); + w[38] = __byte_perm (w[10], w[11], selector); + w[37] = __byte_perm (w[ 9], w[10], selector); + w[36] = __byte_perm (w[ 8], w[ 9], selector); + w[35] = __byte_perm (w[ 7], w[ 8], selector); + w[34] = __byte_perm (w[ 6], w[ 7], selector); + w[33] = __byte_perm (w[ 5], w[ 6], selector); + w[32] = __byte_perm (w[ 4], w[ 5], selector); + w[31] = __byte_perm (w[ 3], w[ 4], selector); + w[30] = __byte_perm (w[ 2], w[ 3], selector); + w[29] = __byte_perm (w[ 1], w[ 2], selector); + w[28] = __byte_perm (w[ 0], w[ 1], selector); + w[27] = __byte_perm ( 0, w[ 0], selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm (w[34], w[35], selector); + w[62] = __byte_perm (w[33], w[34], selector); + w[61] = __byte_perm (w[32], w[33], selector); + w[60] = __byte_perm (w[31], w[32], selector); + w[59] = __byte_perm (w[30], w[31], selector); + w[58] = __byte_perm (w[29], w[30], selector); + 
w[57] = __byte_perm (w[28], w[29], selector); + w[56] = __byte_perm (w[27], w[28], selector); + w[55] = __byte_perm (w[26], w[27], selector); + w[54] = __byte_perm (w[25], w[26], selector); + w[53] = __byte_perm (w[24], w[25], selector); + w[52] = __byte_perm (w[23], w[24], selector); + w[51] = __byte_perm (w[22], w[23], selector); + w[50] = __byte_perm (w[21], w[22], selector); + w[49] = __byte_perm (w[20], w[21], selector); + w[48] = __byte_perm (w[19], w[20], selector); + w[47] = __byte_perm (w[18], w[19], selector); + w[46] = __byte_perm (w[17], w[18], selector); + w[45] = __byte_perm (w[16], w[17], selector); + w[44] = __byte_perm (w[15], w[16], selector); + w[43] = __byte_perm (w[14], w[15], selector); + w[42] = __byte_perm (w[13], w[14], selector); + w[41] = __byte_perm (w[12], w[13], selector); + w[40] = __byte_perm (w[11], w[12], selector); + w[39] = __byte_perm (w[10], w[11], selector); + w[38] = __byte_perm (w[ 9], w[10], selector); + w[37] = __byte_perm (w[ 8], w[ 9], selector); + w[36] = __byte_perm (w[ 7], w[ 8], selector); + w[35] = __byte_perm (w[ 6], w[ 7], selector); + w[34] = __byte_perm (w[ 5], w[ 6], selector); + w[33] = __byte_perm (w[ 4], w[ 5], selector); + w[32] = __byte_perm (w[ 3], w[ 4], selector); + w[31] = __byte_perm (w[ 2], w[ 3], selector); + w[30] = __byte_perm (w[ 1], w[ 2], selector); + w[29] = __byte_perm (w[ 0], w[ 1], selector); + w[28] = __byte_perm ( 0, w[ 0], selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm (w[33], w[34], selector); + w[62] = __byte_perm (w[32], w[33], selector); + w[61] = __byte_perm (w[31], w[32], selector); + w[60] = __byte_perm (w[30], 
w[31], selector); + w[59] = __byte_perm (w[29], w[30], selector); + w[58] = __byte_perm (w[28], w[29], selector); + w[57] = __byte_perm (w[27], w[28], selector); + w[56] = __byte_perm (w[26], w[27], selector); + w[55] = __byte_perm (w[25], w[26], selector); + w[54] = __byte_perm (w[24], w[25], selector); + w[53] = __byte_perm (w[23], w[24], selector); + w[52] = __byte_perm (w[22], w[23], selector); + w[51] = __byte_perm (w[21], w[22], selector); + w[50] = __byte_perm (w[20], w[21], selector); + w[49] = __byte_perm (w[19], w[20], selector); + w[48] = __byte_perm (w[18], w[19], selector); + w[47] = __byte_perm (w[17], w[18], selector); + w[46] = __byte_perm (w[16], w[17], selector); + w[45] = __byte_perm (w[15], w[16], selector); + w[44] = __byte_perm (w[14], w[15], selector); + w[43] = __byte_perm (w[13], w[14], selector); + w[42] = __byte_perm (w[12], w[13], selector); + w[41] = __byte_perm (w[11], w[12], selector); + w[40] = __byte_perm (w[10], w[11], selector); + w[39] = __byte_perm (w[ 9], w[10], selector); + w[38] = __byte_perm (w[ 8], w[ 9], selector); + w[37] = __byte_perm (w[ 7], w[ 8], selector); + w[36] = __byte_perm (w[ 6], w[ 7], selector); + w[35] = __byte_perm (w[ 5], w[ 6], selector); + w[34] = __byte_perm (w[ 4], w[ 5], selector); + w[33] = __byte_perm (w[ 3], w[ 4], selector); + w[32] = __byte_perm (w[ 2], w[ 3], selector); + w[31] = __byte_perm (w[ 1], w[ 2], selector); + w[30] = __byte_perm (w[ 0], w[ 1], selector); + w[29] = __byte_perm ( 0, w[ 0], selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm (w[32], w[33], selector); + w[62] = __byte_perm (w[31], w[32], selector); 
+ w[61] = __byte_perm (w[30], w[31], selector); + w[60] = __byte_perm (w[29], w[30], selector); + w[59] = __byte_perm (w[28], w[29], selector); + w[58] = __byte_perm (w[27], w[28], selector); + w[57] = __byte_perm (w[26], w[27], selector); + w[56] = __byte_perm (w[25], w[26], selector); + w[55] = __byte_perm (w[24], w[25], selector); + w[54] = __byte_perm (w[23], w[24], selector); + w[53] = __byte_perm (w[22], w[23], selector); + w[52] = __byte_perm (w[21], w[22], selector); + w[51] = __byte_perm (w[20], w[21], selector); + w[50] = __byte_perm (w[19], w[20], selector); + w[49] = __byte_perm (w[18], w[19], selector); + w[48] = __byte_perm (w[17], w[18], selector); + w[47] = __byte_perm (w[16], w[17], selector); + w[46] = __byte_perm (w[15], w[16], selector); + w[45] = __byte_perm (w[14], w[15], selector); + w[44] = __byte_perm (w[13], w[14], selector); + w[43] = __byte_perm (w[12], w[13], selector); + w[42] = __byte_perm (w[11], w[12], selector); + w[41] = __byte_perm (w[10], w[11], selector); + w[40] = __byte_perm (w[ 9], w[10], selector); + w[39] = __byte_perm (w[ 8], w[ 9], selector); + w[38] = __byte_perm (w[ 7], w[ 8], selector); + w[37] = __byte_perm (w[ 6], w[ 7], selector); + w[36] = __byte_perm (w[ 5], w[ 6], selector); + w[35] = __byte_perm (w[ 4], w[ 5], selector); + w[34] = __byte_perm (w[ 3], w[ 4], selector); + w[33] = __byte_perm (w[ 2], w[ 3], selector); + w[32] = __byte_perm (w[ 1], w[ 2], selector); + w[31] = __byte_perm (w[ 0], w[ 1], selector); + w[30] = __byte_perm ( 0, w[ 0], selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm (w[31], w[32], selector); + 
w[62] = __byte_perm (w[30], w[31], selector); + w[61] = __byte_perm (w[29], w[30], selector); + w[60] = __byte_perm (w[28], w[29], selector); + w[59] = __byte_perm (w[27], w[28], selector); + w[58] = __byte_perm (w[26], w[27], selector); + w[57] = __byte_perm (w[25], w[26], selector); + w[56] = __byte_perm (w[24], w[25], selector); + w[55] = __byte_perm (w[23], w[24], selector); + w[54] = __byte_perm (w[22], w[23], selector); + w[53] = __byte_perm (w[21], w[22], selector); + w[52] = __byte_perm (w[20], w[21], selector); + w[51] = __byte_perm (w[19], w[20], selector); + w[50] = __byte_perm (w[18], w[19], selector); + w[49] = __byte_perm (w[17], w[18], selector); + w[48] = __byte_perm (w[16], w[17], selector); + w[47] = __byte_perm (w[15], w[16], selector); + w[46] = __byte_perm (w[14], w[15], selector); + w[45] = __byte_perm (w[13], w[14], selector); + w[44] = __byte_perm (w[12], w[13], selector); + w[43] = __byte_perm (w[11], w[12], selector); + w[42] = __byte_perm (w[10], w[11], selector); + w[41] = __byte_perm (w[ 9], w[10], selector); + w[40] = __byte_perm (w[ 8], w[ 9], selector); + w[39] = __byte_perm (w[ 7], w[ 8], selector); + w[38] = __byte_perm (w[ 6], w[ 7], selector); + w[37] = __byte_perm (w[ 5], w[ 6], selector); + w[36] = __byte_perm (w[ 4], w[ 5], selector); + w[35] = __byte_perm (w[ 3], w[ 4], selector); + w[34] = __byte_perm (w[ 2], w[ 3], selector); + w[33] = __byte_perm (w[ 1], w[ 2], selector); + w[32] = __byte_perm (w[ 0], w[ 1], selector); + w[31] = __byte_perm ( 0, w[ 0], selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm (w[30], w[31], 
selector); + w[62] = __byte_perm (w[29], w[30], selector); + w[61] = __byte_perm (w[28], w[29], selector); + w[60] = __byte_perm (w[27], w[28], selector); + w[59] = __byte_perm (w[26], w[27], selector); + w[58] = __byte_perm (w[25], w[26], selector); + w[57] = __byte_perm (w[24], w[25], selector); + w[56] = __byte_perm (w[23], w[24], selector); + w[55] = __byte_perm (w[22], w[23], selector); + w[54] = __byte_perm (w[21], w[22], selector); + w[53] = __byte_perm (w[20], w[21], selector); + w[52] = __byte_perm (w[19], w[20], selector); + w[51] = __byte_perm (w[18], w[19], selector); + w[50] = __byte_perm (w[17], w[18], selector); + w[49] = __byte_perm (w[16], w[17], selector); + w[48] = __byte_perm (w[15], w[16], selector); + w[47] = __byte_perm (w[14], w[15], selector); + w[46] = __byte_perm (w[13], w[14], selector); + w[45] = __byte_perm (w[12], w[13], selector); + w[44] = __byte_perm (w[11], w[12], selector); + w[43] = __byte_perm (w[10], w[11], selector); + w[42] = __byte_perm (w[ 9], w[10], selector); + w[41] = __byte_perm (w[ 8], w[ 9], selector); + w[40] = __byte_perm (w[ 7], w[ 8], selector); + w[39] = __byte_perm (w[ 6], w[ 7], selector); + w[38] = __byte_perm (w[ 5], w[ 6], selector); + w[37] = __byte_perm (w[ 4], w[ 5], selector); + w[36] = __byte_perm (w[ 3], w[ 4], selector); + w[35] = __byte_perm (w[ 2], w[ 3], selector); + w[34] = __byte_perm (w[ 1], w[ 2], selector); + w[33] = __byte_perm (w[ 0], w[ 1], selector); + w[32] = __byte_perm ( 0, w[ 0], selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm (w[29], w[30], selector); + w[62] = 
__byte_perm (w[28], w[29], selector); + w[61] = __byte_perm (w[27], w[28], selector); + w[60] = __byte_perm (w[26], w[27], selector); + w[59] = __byte_perm (w[25], w[26], selector); + w[58] = __byte_perm (w[24], w[25], selector); + w[57] = __byte_perm (w[23], w[24], selector); + w[56] = __byte_perm (w[22], w[23], selector); + w[55] = __byte_perm (w[21], w[22], selector); + w[54] = __byte_perm (w[20], w[21], selector); + w[53] = __byte_perm (w[19], w[20], selector); + w[52] = __byte_perm (w[18], w[19], selector); + w[51] = __byte_perm (w[17], w[18], selector); + w[50] = __byte_perm (w[16], w[17], selector); + w[49] = __byte_perm (w[15], w[16], selector); + w[48] = __byte_perm (w[14], w[15], selector); + w[47] = __byte_perm (w[13], w[14], selector); + w[46] = __byte_perm (w[12], w[13], selector); + w[45] = __byte_perm (w[11], w[12], selector); + w[44] = __byte_perm (w[10], w[11], selector); + w[43] = __byte_perm (w[ 9], w[10], selector); + w[42] = __byte_perm (w[ 8], w[ 9], selector); + w[41] = __byte_perm (w[ 7], w[ 8], selector); + w[40] = __byte_perm (w[ 6], w[ 7], selector); + w[39] = __byte_perm (w[ 5], w[ 6], selector); + w[38] = __byte_perm (w[ 4], w[ 5], selector); + w[37] = __byte_perm (w[ 3], w[ 4], selector); + w[36] = __byte_perm (w[ 2], w[ 3], selector); + w[35] = __byte_perm (w[ 1], w[ 2], selector); + w[34] = __byte_perm (w[ 0], w[ 1], selector); + w[33] = __byte_perm ( 0, w[ 0], selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm (w[28], w[29], selector); + w[62] = __byte_perm (w[27], w[28], selector); + w[61] = __byte_perm 
(w[26], w[27], selector); + w[60] = __byte_perm (w[25], w[26], selector); + w[59] = __byte_perm (w[24], w[25], selector); + w[58] = __byte_perm (w[23], w[24], selector); + w[57] = __byte_perm (w[22], w[23], selector); + w[56] = __byte_perm (w[21], w[22], selector); + w[55] = __byte_perm (w[20], w[21], selector); + w[54] = __byte_perm (w[19], w[20], selector); + w[53] = __byte_perm (w[18], w[19], selector); + w[52] = __byte_perm (w[17], w[18], selector); + w[51] = __byte_perm (w[16], w[17], selector); + w[50] = __byte_perm (w[15], w[16], selector); + w[49] = __byte_perm (w[14], w[15], selector); + w[48] = __byte_perm (w[13], w[14], selector); + w[47] = __byte_perm (w[12], w[13], selector); + w[46] = __byte_perm (w[11], w[12], selector); + w[45] = __byte_perm (w[10], w[11], selector); + w[44] = __byte_perm (w[ 9], w[10], selector); + w[43] = __byte_perm (w[ 8], w[ 9], selector); + w[42] = __byte_perm (w[ 7], w[ 8], selector); + w[41] = __byte_perm (w[ 6], w[ 7], selector); + w[40] = __byte_perm (w[ 5], w[ 6], selector); + w[39] = __byte_perm (w[ 4], w[ 5], selector); + w[38] = __byte_perm (w[ 3], w[ 4], selector); + w[37] = __byte_perm (w[ 2], w[ 3], selector); + w[36] = __byte_perm (w[ 1], w[ 2], selector); + w[35] = __byte_perm (w[ 0], w[ 1], selector); + w[34] = __byte_perm ( 0, w[ 0], selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm (w[27], w[28], selector); + w[62] = __byte_perm (w[26], w[27], selector); + w[61] = __byte_perm (w[25], w[26], selector); + w[60] = __byte_perm (w[24], w[25], selector); + w[59] = __byte_perm 
(w[23], w[24], selector); + w[58] = __byte_perm (w[22], w[23], selector); + w[57] = __byte_perm (w[21], w[22], selector); + w[56] = __byte_perm (w[20], w[21], selector); + w[55] = __byte_perm (w[19], w[20], selector); + w[54] = __byte_perm (w[18], w[19], selector); + w[53] = __byte_perm (w[17], w[18], selector); + w[52] = __byte_perm (w[16], w[17], selector); + w[51] = __byte_perm (w[15], w[16], selector); + w[50] = __byte_perm (w[14], w[15], selector); + w[49] = __byte_perm (w[13], w[14], selector); + w[48] = __byte_perm (w[12], w[13], selector); + w[47] = __byte_perm (w[11], w[12], selector); + w[46] = __byte_perm (w[10], w[11], selector); + w[45] = __byte_perm (w[ 9], w[10], selector); + w[44] = __byte_perm (w[ 8], w[ 9], selector); + w[43] = __byte_perm (w[ 7], w[ 8], selector); + w[42] = __byte_perm (w[ 6], w[ 7], selector); + w[41] = __byte_perm (w[ 5], w[ 6], selector); + w[40] = __byte_perm (w[ 4], w[ 5], selector); + w[39] = __byte_perm (w[ 3], w[ 4], selector); + w[38] = __byte_perm (w[ 2], w[ 3], selector); + w[37] = __byte_perm (w[ 1], w[ 2], selector); + w[36] = __byte_perm (w[ 0], w[ 1], selector); + w[35] = __byte_perm ( 0, w[ 0], selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm (w[26], w[27], selector); + w[62] = __byte_perm (w[25], w[26], selector); + w[61] = __byte_perm (w[24], w[25], selector); + w[60] = __byte_perm (w[23], w[24], selector); + w[59] = __byte_perm (w[22], w[23], selector); + w[58] = __byte_perm (w[21], w[22], selector); + w[57] = __byte_perm (w[20], w[21], selector); + w[56] = 
__byte_perm (w[19], w[20], selector); + w[55] = __byte_perm (w[18], w[19], selector); + w[54] = __byte_perm (w[17], w[18], selector); + w[53] = __byte_perm (w[16], w[17], selector); + w[52] = __byte_perm (w[15], w[16], selector); + w[51] = __byte_perm (w[14], w[15], selector); + w[50] = __byte_perm (w[13], w[14], selector); + w[49] = __byte_perm (w[12], w[13], selector); + w[48] = __byte_perm (w[11], w[12], selector); + w[47] = __byte_perm (w[10], w[11], selector); + w[46] = __byte_perm (w[ 9], w[10], selector); + w[45] = __byte_perm (w[ 8], w[ 9], selector); + w[44] = __byte_perm (w[ 7], w[ 8], selector); + w[43] = __byte_perm (w[ 6], w[ 7], selector); + w[42] = __byte_perm (w[ 5], w[ 6], selector); + w[41] = __byte_perm (w[ 4], w[ 5], selector); + w[40] = __byte_perm (w[ 3], w[ 4], selector); + w[39] = __byte_perm (w[ 2], w[ 3], selector); + w[38] = __byte_perm (w[ 1], w[ 2], selector); + w[37] = __byte_perm (w[ 0], w[ 1], selector); + w[36] = __byte_perm ( 0, w[ 0], selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm (w[25], w[26], selector); + w[62] = __byte_perm (w[24], w[25], selector); + w[61] = __byte_perm (w[23], w[24], selector); + w[60] = __byte_perm (w[22], w[23], selector); + w[59] = __byte_perm (w[21], w[22], selector); + w[58] = __byte_perm (w[20], w[21], selector); + w[57] = __byte_perm (w[19], w[20], selector); + w[56] = __byte_perm (w[18], w[19], selector); + w[55] = __byte_perm (w[17], w[18], selector); + w[54] = __byte_perm (w[16], w[17], selector); + w[53] = __byte_perm (w[15], 
w[16], selector); + w[52] = __byte_perm (w[14], w[15], selector); + w[51] = __byte_perm (w[13], w[14], selector); + w[50] = __byte_perm (w[12], w[13], selector); + w[49] = __byte_perm (w[11], w[12], selector); + w[48] = __byte_perm (w[10], w[11], selector); + w[47] = __byte_perm (w[ 9], w[10], selector); + w[46] = __byte_perm (w[ 8], w[ 9], selector); + w[45] = __byte_perm (w[ 7], w[ 8], selector); + w[44] = __byte_perm (w[ 6], w[ 7], selector); + w[43] = __byte_perm (w[ 5], w[ 6], selector); + w[42] = __byte_perm (w[ 4], w[ 5], selector); + w[41] = __byte_perm (w[ 3], w[ 4], selector); + w[40] = __byte_perm (w[ 2], w[ 3], selector); + w[39] = __byte_perm (w[ 1], w[ 2], selector); + w[38] = __byte_perm (w[ 0], w[ 1], selector); + w[37] = __byte_perm ( 0, w[ 0], selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm (w[24], w[25], selector); + w[62] = __byte_perm (w[23], w[24], selector); + w[61] = __byte_perm (w[22], w[23], selector); + w[60] = __byte_perm (w[21], w[22], selector); + w[59] = __byte_perm (w[20], w[21], selector); + w[58] = __byte_perm (w[19], w[20], selector); + w[57] = __byte_perm (w[18], w[19], selector); + w[56] = __byte_perm (w[17], w[18], selector); + w[55] = __byte_perm (w[16], w[17], selector); + w[54] = __byte_perm (w[15], w[16], selector); + w[53] = __byte_perm (w[14], w[15], selector); + w[52] = __byte_perm (w[13], w[14], selector); + w[51] = __byte_perm (w[12], w[13], selector); + w[50] = __byte_perm (w[11], w[12], selector); + w[49] = __byte_perm (w[10], w[11], 
selector); + w[48] = __byte_perm (w[ 9], w[10], selector); + w[47] = __byte_perm (w[ 8], w[ 9], selector); + w[46] = __byte_perm (w[ 7], w[ 8], selector); + w[45] = __byte_perm (w[ 6], w[ 7], selector); + w[44] = __byte_perm (w[ 5], w[ 6], selector); + w[43] = __byte_perm (w[ 4], w[ 5], selector); + w[42] = __byte_perm (w[ 3], w[ 4], selector); + w[41] = __byte_perm (w[ 2], w[ 3], selector); + w[40] = __byte_perm (w[ 1], w[ 2], selector); + w[39] = __byte_perm (w[ 0], w[ 1], selector); + w[38] = __byte_perm ( 0, w[ 0], selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm (w[23], w[24], selector); + w[62] = __byte_perm (w[22], w[23], selector); + w[61] = __byte_perm (w[21], w[22], selector); + w[60] = __byte_perm (w[20], w[21], selector); + w[59] = __byte_perm (w[19], w[20], selector); + w[58] = __byte_perm (w[18], w[19], selector); + w[57] = __byte_perm (w[17], w[18], selector); + w[56] = __byte_perm (w[16], w[17], selector); + w[55] = __byte_perm (w[15], w[16], selector); + w[54] = __byte_perm (w[14], w[15], selector); + w[53] = __byte_perm (w[13], w[14], selector); + w[52] = __byte_perm (w[12], w[13], selector); + w[51] = __byte_perm (w[11], w[12], selector); + w[50] = __byte_perm (w[10], w[11], selector); + w[49] = __byte_perm (w[ 9], w[10], selector); + w[48] = __byte_perm (w[ 8], w[ 9], selector); + w[47] = __byte_perm (w[ 7], w[ 8], selector); + w[46] = __byte_perm (w[ 6], w[ 7], selector); + w[45] = __byte_perm (w[ 5], w[ 6], selector); + w[44] = __byte_perm (w[ 4], w[ 
5], selector); + w[43] = __byte_perm (w[ 3], w[ 4], selector); + w[42] = __byte_perm (w[ 2], w[ 3], selector); + w[41] = __byte_perm (w[ 1], w[ 2], selector); + w[40] = __byte_perm (w[ 0], w[ 1], selector); + w[39] = __byte_perm ( 0, w[ 0], selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm (w[22], w[23], selector); + w[62] = __byte_perm (w[21], w[22], selector); + w[61] = __byte_perm (w[20], w[21], selector); + w[60] = __byte_perm (w[19], w[20], selector); + w[59] = __byte_perm (w[18], w[19], selector); + w[58] = __byte_perm (w[17], w[18], selector); + w[57] = __byte_perm (w[16], w[17], selector); + w[56] = __byte_perm (w[15], w[16], selector); + w[55] = __byte_perm (w[14], w[15], selector); + w[54] = __byte_perm (w[13], w[14], selector); + w[53] = __byte_perm (w[12], w[13], selector); + w[52] = __byte_perm (w[11], w[12], selector); + w[51] = __byte_perm (w[10], w[11], selector); + w[50] = __byte_perm (w[ 9], w[10], selector); + w[49] = __byte_perm (w[ 8], w[ 9], selector); + w[48] = __byte_perm (w[ 7], w[ 8], selector); + w[47] = __byte_perm (w[ 6], w[ 7], selector); + w[46] = __byte_perm (w[ 5], w[ 6], selector); + w[45] = __byte_perm (w[ 4], w[ 5], selector); + w[44] = __byte_perm (w[ 3], w[ 4], selector); + w[43] = __byte_perm (w[ 2], w[ 3], selector); + w[42] = __byte_perm (w[ 1], w[ 2], selector); + w[41] = __byte_perm (w[ 0], w[ 1], selector); + w[40] = __byte_perm ( 0, w[ 0], selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + 
w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm (w[21], w[22], selector); + w[62] = __byte_perm (w[20], w[21], selector); + w[61] = __byte_perm (w[19], w[20], selector); + w[60] = __byte_perm (w[18], w[19], selector); + w[59] = __byte_perm (w[17], w[18], selector); + w[58] = __byte_perm (w[16], w[17], selector); + w[57] = __byte_perm (w[15], w[16], selector); + w[56] = __byte_perm (w[14], w[15], selector); + w[55] = __byte_perm (w[13], w[14], selector); + w[54] = __byte_perm (w[12], w[13], selector); + w[53] = __byte_perm (w[11], w[12], selector); + w[52] = __byte_perm (w[10], w[11], selector); + w[51] = __byte_perm (w[ 9], w[10], selector); + w[50] = __byte_perm (w[ 8], w[ 9], selector); + w[49] = __byte_perm (w[ 7], w[ 8], selector); + w[48] = __byte_perm (w[ 6], w[ 7], selector); + w[47] = __byte_perm (w[ 5], w[ 6], selector); + w[46] = __byte_perm (w[ 4], w[ 5], selector); + w[45] = __byte_perm (w[ 3], w[ 4], selector); + w[44] = __byte_perm (w[ 2], w[ 3], selector); + w[43] = __byte_perm (w[ 1], w[ 2], selector); + w[42] = __byte_perm (w[ 0], w[ 1], selector); + w[41] = __byte_perm ( 0, w[ 0], selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 
8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm (w[20], w[21], selector); + w[62] = __byte_perm (w[19], w[20], selector); + w[61] = __byte_perm (w[18], w[19], selector); + w[60] = __byte_perm (w[17], w[18], selector); + w[59] = __byte_perm (w[16], w[17], selector); + w[58] = __byte_perm (w[15], w[16], selector); + w[57] = __byte_perm (w[14], w[15], selector); + w[56] = __byte_perm (w[13], w[14], selector); + w[55] = __byte_perm (w[12], w[13], selector); + w[54] = __byte_perm (w[11], w[12], selector); + w[53] = __byte_perm (w[10], w[11], selector); + w[52] = __byte_perm (w[ 9], w[10], selector); + w[51] = __byte_perm (w[ 8], w[ 9], selector); + w[50] = __byte_perm (w[ 7], w[ 8], selector); + w[49] = __byte_perm (w[ 6], w[ 7], selector); + w[48] = __byte_perm (w[ 5], w[ 6], selector); + w[47] = __byte_perm (w[ 4], w[ 5], selector); + w[46] = __byte_perm (w[ 3], w[ 4], selector); + w[45] = __byte_perm (w[ 2], w[ 3], selector); + w[44] = __byte_perm (w[ 1], w[ 2], selector); + w[43] = __byte_perm (w[ 0], w[ 1], selector); + w[42] = __byte_perm ( 0, w[ 0], selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = __byte_perm (w[19], w[20], selector); + w[62] = __byte_perm (w[18], w[19], selector); + w[61] = __byte_perm (w[17], w[18], selector); + w[60] = __byte_perm (w[16], w[17], selector); + w[59] = __byte_perm (w[15], w[16], selector); 
+ w[58] = __byte_perm (w[14], w[15], selector); + w[57] = __byte_perm (w[13], w[14], selector); + w[56] = __byte_perm (w[12], w[13], selector); + w[55] = __byte_perm (w[11], w[12], selector); + w[54] = __byte_perm (w[10], w[11], selector); + w[53] = __byte_perm (w[ 9], w[10], selector); + w[52] = __byte_perm (w[ 8], w[ 9], selector); + w[51] = __byte_perm (w[ 7], w[ 8], selector); + w[50] = __byte_perm (w[ 6], w[ 7], selector); + w[49] = __byte_perm (w[ 5], w[ 6], selector); + w[48] = __byte_perm (w[ 4], w[ 5], selector); + w[47] = __byte_perm (w[ 3], w[ 4], selector); + w[46] = __byte_perm (w[ 2], w[ 3], selector); + w[45] = __byte_perm (w[ 1], w[ 2], selector); + w[44] = __byte_perm (w[ 0], w[ 1], selector); + w[43] = __byte_perm ( 0, w[ 0], selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm (w[18], w[19], selector); + w[62] = __byte_perm (w[17], w[18], selector); + w[61] = __byte_perm (w[16], w[17], selector); + w[60] = __byte_perm (w[15], w[16], selector); + w[59] = __byte_perm (w[14], w[15], selector); + w[58] = __byte_perm (w[13], w[14], selector); + w[57] = __byte_perm (w[12], w[13], selector); + w[56] = __byte_perm (w[11], w[12], selector); + w[55] = __byte_perm (w[10], w[11], selector); + w[54] = __byte_perm (w[ 9], w[10], selector); + w[53] = __byte_perm (w[ 8], w[ 9], selector); + w[52] = __byte_perm (w[ 7], w[ 8], selector); + w[51] = __byte_perm (w[ 6], w[ 7], selector); + w[50] = __byte_perm (w[ 
5], w[ 6], selector); + w[49] = __byte_perm (w[ 4], w[ 5], selector); + w[48] = __byte_perm (w[ 3], w[ 4], selector); + w[47] = __byte_perm (w[ 2], w[ 3], selector); + w[46] = __byte_perm (w[ 1], w[ 2], selector); + w[45] = __byte_perm (w[ 0], w[ 1], selector); + w[44] = __byte_perm ( 0, w[ 0], selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm (w[17], w[18], selector); + w[62] = __byte_perm (w[16], w[17], selector); + w[61] = __byte_perm (w[15], w[16], selector); + w[60] = __byte_perm (w[14], w[15], selector); + w[59] = __byte_perm (w[13], w[14], selector); + w[58] = __byte_perm (w[12], w[13], selector); + w[57] = __byte_perm (w[11], w[12], selector); + w[56] = __byte_perm (w[10], w[11], selector); + w[55] = __byte_perm (w[ 9], w[10], selector); + w[54] = __byte_perm (w[ 8], w[ 9], selector); + w[53] = __byte_perm (w[ 7], w[ 8], selector); + w[52] = __byte_perm (w[ 6], w[ 7], selector); + w[51] = __byte_perm (w[ 5], w[ 6], selector); + w[50] = __byte_perm (w[ 4], w[ 5], selector); + w[49] = __byte_perm (w[ 3], w[ 4], selector); + w[48] = __byte_perm (w[ 2], w[ 3], selector); + w[47] = __byte_perm (w[ 1], w[ 2], selector); + w[46] = __byte_perm (w[ 0], w[ 1], selector); + w[45] = __byte_perm ( 0, w[ 0], selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + 
w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm (w[16], w[17], selector); + w[62] = __byte_perm (w[15], w[16], selector); + w[61] = __byte_perm (w[14], w[15], selector); + w[60] = __byte_perm (w[13], w[14], selector); + w[59] = __byte_perm (w[12], w[13], selector); + w[58] = __byte_perm (w[11], w[12], selector); + w[57] = __byte_perm (w[10], w[11], selector); + w[56] = __byte_perm (w[ 9], w[10], selector); + w[55] = __byte_perm (w[ 8], w[ 9], selector); + w[54] = __byte_perm (w[ 7], w[ 8], selector); + w[53] = __byte_perm (w[ 6], w[ 7], selector); + w[52] = __byte_perm (w[ 5], w[ 6], selector); + w[51] = __byte_perm (w[ 4], w[ 5], selector); + w[50] = __byte_perm (w[ 3], w[ 4], selector); + w[49] = __byte_perm (w[ 2], w[ 3], selector); + w[48] = __byte_perm (w[ 1], w[ 2], selector); + w[47] = __byte_perm (w[ 0], w[ 1], selector); + w[46] = __byte_perm ( 0, w[ 0], selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm (w[15], w[16], selector); + w[62] = __byte_perm (w[14], w[15], 
selector); + w[61] = __byte_perm (w[13], w[14], selector); + w[60] = __byte_perm (w[12], w[13], selector); + w[59] = __byte_perm (w[11], w[12], selector); + w[58] = __byte_perm (w[10], w[11], selector); + w[57] = __byte_perm (w[ 9], w[10], selector); + w[56] = __byte_perm (w[ 8], w[ 9], selector); + w[55] = __byte_perm (w[ 7], w[ 8], selector); + w[54] = __byte_perm (w[ 6], w[ 7], selector); + w[53] = __byte_perm (w[ 5], w[ 6], selector); + w[52] = __byte_perm (w[ 4], w[ 5], selector); + w[51] = __byte_perm (w[ 3], w[ 4], selector); + w[50] = __byte_perm (w[ 2], w[ 3], selector); + w[49] = __byte_perm (w[ 1], w[ 2], selector); + w[48] = __byte_perm (w[ 0], w[ 1], selector); + w[47] = __byte_perm ( 0, w[ 0], selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = __byte_perm (w[14], w[15], selector); + w[62] = __byte_perm (w[13], w[14], selector); + w[61] = __byte_perm (w[12], w[13], selector); + w[60] = __byte_perm (w[11], w[12], selector); + w[59] = __byte_perm (w[10], w[11], selector); + w[58] = __byte_perm (w[ 9], w[10], selector); + w[57] = __byte_perm (w[ 8], w[ 9], selector); + w[56] = __byte_perm (w[ 7], w[ 8], selector); + w[55] = __byte_perm (w[ 6], w[ 7], selector); + w[54] = __byte_perm (w[ 5], w[ 6], selector); + w[53] = __byte_perm (w[ 4], w[ 5], selector); + w[52] = __byte_perm (w[ 3], w[ 4], selector); + w[51] = __byte_perm (w[ 2], w[ 3], selector); + w[50] = 
__byte_perm (w[ 1], w[ 2], selector); + w[49] = __byte_perm (w[ 0], w[ 1], selector); + w[48] = __byte_perm ( 0, w[ 0], selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm (w[13], w[14], selector); + w[62] = __byte_perm (w[12], w[13], selector); + w[61] = __byte_perm (w[11], w[12], selector); + w[60] = __byte_perm (w[10], w[11], selector); + w[59] = __byte_perm (w[ 9], w[10], selector); + w[58] = __byte_perm (w[ 8], w[ 9], selector); + w[57] = __byte_perm (w[ 7], w[ 8], selector); + w[56] = __byte_perm (w[ 6], w[ 7], selector); + w[55] = __byte_perm (w[ 5], w[ 6], selector); + w[54] = __byte_perm (w[ 4], w[ 5], selector); + w[53] = __byte_perm (w[ 3], w[ 4], selector); + w[52] = __byte_perm (w[ 2], w[ 3], selector); + w[51] = __byte_perm (w[ 1], w[ 2], selector); + w[50] = __byte_perm (w[ 0], w[ 1], selector); + w[49] = __byte_perm ( 0, w[ 0], selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 
0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm (w[12], w[13], selector); + w[62] = __byte_perm (w[11], w[12], selector); + w[61] = __byte_perm (w[10], w[11], selector); + w[60] = __byte_perm (w[ 9], w[10], selector); + w[59] = __byte_perm (w[ 8], w[ 9], selector); + w[58] = __byte_perm (w[ 7], w[ 8], selector); + w[57] = __byte_perm (w[ 6], w[ 7], selector); + w[56] = __byte_perm (w[ 5], w[ 6], selector); + w[55] = __byte_perm (w[ 4], w[ 5], selector); + w[54] = __byte_perm (w[ 3], w[ 4], selector); + w[53] = __byte_perm (w[ 2], w[ 3], selector); + w[52] = __byte_perm (w[ 1], w[ 2], selector); + w[51] = __byte_perm (w[ 0], w[ 1], selector); + w[50] = __byte_perm ( 0, w[ 0], selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm (w[11], w[12], selector); + w[62] = __byte_perm (w[10], w[11], selector); + w[61] = __byte_perm (w[ 9], w[10], selector); + w[60] = __byte_perm (w[ 8], w[ 9], selector); + w[59] = __byte_perm (w[ 7], w[ 8], selector); + w[58] = __byte_perm (w[ 6], w[ 7], selector); + w[57] = __byte_perm (w[ 5], w[ 6], selector); + w[56] = __byte_perm (w[ 4], w[ 5], selector); + w[55] = __byte_perm (w[ 3], w[ 4], selector); + w[54] = __byte_perm (w[ 2], w[ 3], selector); + w[53] = 
__byte_perm (w[ 1], w[ 2], selector); + w[52] = __byte_perm (w[ 0], w[ 1], selector); + w[51] = __byte_perm ( 0, w[ 0], selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm (w[10], w[11], selector); + w[62] = __byte_perm (w[ 9], w[10], selector); + w[61] = __byte_perm (w[ 8], w[ 9], selector); + w[60] = __byte_perm (w[ 7], w[ 8], selector); + w[59] = __byte_perm (w[ 6], w[ 7], selector); + w[58] = __byte_perm (w[ 5], w[ 6], selector); + w[57] = __byte_perm (w[ 4], w[ 5], selector); + w[56] = __byte_perm (w[ 3], w[ 4], selector); + w[55] = __byte_perm (w[ 2], w[ 3], selector); + w[54] = __byte_perm (w[ 1], w[ 2], selector); + w[53] = __byte_perm (w[ 0], w[ 1], selector); + w[52] = __byte_perm ( 0, w[ 0], selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 
0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm (w[ 9], w[10], selector); + w[62] = __byte_perm (w[ 8], w[ 9], selector); + w[61] = __byte_perm (w[ 7], w[ 8], selector); + w[60] = __byte_perm (w[ 6], w[ 7], selector); + w[59] = __byte_perm (w[ 5], w[ 6], selector); + w[58] = __byte_perm (w[ 4], w[ 5], selector); + w[57] = __byte_perm (w[ 3], w[ 4], selector); + w[56] = __byte_perm (w[ 2], w[ 3], selector); + w[55] = __byte_perm (w[ 1], w[ 2], selector); + w[54] = __byte_perm (w[ 0], w[ 1], selector); + w[53] = __byte_perm ( 0, w[ 0], selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm (w[ 8], w[ 9], selector); + w[62] = __byte_perm (w[ 7], w[ 8], selector); + w[61] = __byte_perm (w[ 6], w[ 7], selector); + w[60] = __byte_perm (w[ 5], w[ 6], selector); + w[59] = __byte_perm (w[ 4], w[ 5], selector); + w[58] = __byte_perm (w[ 3], w[ 4], selector); + w[57] = __byte_perm (w[ 2], w[ 3], selector); + w[56] = __byte_perm (w[ 1], w[ 2], selector); + w[55] = __byte_perm (w[ 0], w[ 1], selector); + w[54] = __byte_perm ( 0, w[ 0], selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 
0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm (w[ 7], w[ 8], selector); + w[62] = __byte_perm (w[ 6], w[ 7], selector); + w[61] = __byte_perm (w[ 5], w[ 6], selector); + w[60] = __byte_perm (w[ 4], w[ 5], selector); + w[59] = __byte_perm (w[ 3], w[ 4], selector); + w[58] = __byte_perm (w[ 2], w[ 3], selector); + w[57] = __byte_perm (w[ 1], w[ 2], selector); + w[56] = __byte_perm (w[ 0], w[ 1], selector); + w[55] = __byte_perm ( 0, w[ 0], selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm (w[ 6], w[ 7], selector); + w[62] = __byte_perm (w[ 5], w[ 6], selector); + w[61] = __byte_perm (w[ 4], w[ 5], selector); + w[60] = __byte_perm (w[ 3], w[ 4], selector); + w[59] = __byte_perm (w[ 2], w[ 3], selector); + w[58] = __byte_perm (w[ 1], w[ 2], selector); + w[57] = 
__byte_perm (w[ 0], w[ 1], selector); + w[56] = __byte_perm ( 0, w[ 0], selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm (w[ 5], w[ 6], selector); + w[62] = __byte_perm (w[ 4], w[ 5], selector); + w[61] = __byte_perm (w[ 3], w[ 4], selector); + w[60] = __byte_perm (w[ 2], w[ 3], selector); + w[59] = __byte_perm (w[ 1], w[ 2], selector); + w[58] = __byte_perm (w[ 0], w[ 1], selector); + w[57] = __byte_perm ( 0, w[ 0], selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm (w[ 4], w[ 5], selector); + w[62] = 
__byte_perm (w[ 3], w[ 4], selector); + w[61] = __byte_perm (w[ 2], w[ 3], selector); + w[60] = __byte_perm (w[ 1], w[ 2], selector); + w[59] = __byte_perm (w[ 0], w[ 1], selector); + w[58] = __byte_perm ( 0, w[ 0], selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm (w[ 3], w[ 4], selector); + w[62] = __byte_perm (w[ 2], w[ 3], selector); + w[61] = __byte_perm (w[ 1], w[ 2], selector); + w[60] = __byte_perm (w[ 0], w[ 1], selector); + w[59] = __byte_perm ( 0, w[ 0], selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + 
w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm (w[ 2], w[ 3], selector); + w[62] = __byte_perm (w[ 1], w[ 2], selector); + w[61] = __byte_perm (w[ 0], w[ 1], selector); + w[60] = __byte_perm ( 0, w[ 0], selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm (w[ 1], w[ 2], selector); + w[62] = __byte_perm (w[ 0], w[ 1], selector); + w[61] = __byte_perm ( 0, w[ 0], selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = 
__byte_perm (w[ 0], w[ 1], selector); + w[62] = __byte_perm ( 0, w[ 0], selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm ( 0, w[ 0], selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif +} + +void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign (w[62], w[63], 
offset); + w[62] = amd_bytealign (w[61], w[62], offset); + w[61] = amd_bytealign (w[60], w[61], offset); + w[60] = amd_bytealign (w[59], w[60], offset); + w[59] = amd_bytealign (w[58], w[59], offset); + w[58] = amd_bytealign (w[57], w[58], offset); + w[57] = amd_bytealign (w[56], w[57], offset); + w[56] = amd_bytealign (w[55], w[56], offset); + w[55] = amd_bytealign (w[54], w[55], offset); + w[54] = amd_bytealign (w[53], w[54], offset); + w[53] = amd_bytealign (w[52], w[53], offset); + w[52] = amd_bytealign (w[51], w[52], offset); + w[51] = amd_bytealign (w[50], w[51], offset); + w[50] = amd_bytealign (w[49], w[50], offset); + w[49] = amd_bytealign (w[48], w[49], offset); + w[48] = amd_bytealign (w[47], w[48], offset); + w[47] = amd_bytealign (w[46], w[47], offset); + w[46] = amd_bytealign (w[45], w[46], offset); + w[45] = amd_bytealign (w[44], w[45], offset); + w[44] = amd_bytealign (w[43], w[44], offset); + w[43] = amd_bytealign (w[42], w[43], offset); + w[42] = amd_bytealign (w[41], w[42], offset); + w[41] = amd_bytealign (w[40], w[41], offset); + w[40] = amd_bytealign (w[39], w[40], offset); + w[39] = amd_bytealign (w[38], w[39], offset); + w[38] = amd_bytealign (w[37], w[38], offset); + w[37] = amd_bytealign (w[36], w[37], offset); + w[36] = amd_bytealign (w[35], w[36], offset); + w[35] = amd_bytealign (w[34], w[35], offset); + w[34] = amd_bytealign (w[33], w[34], offset); + w[33] = amd_bytealign (w[32], w[33], offset); + w[32] = amd_bytealign (w[31], w[32], offset); + w[31] = amd_bytealign (w[30], w[31], offset); + w[30] = amd_bytealign (w[29], w[30], offset); + w[29] = amd_bytealign (w[28], w[29], offset); + w[28] = amd_bytealign (w[27], w[28], offset); + w[27] = amd_bytealign (w[26], w[27], offset); + w[26] = amd_bytealign (w[25], w[26], offset); + w[25] = amd_bytealign (w[24], w[25], offset); + w[24] = amd_bytealign (w[23], w[24], offset); + w[23] = amd_bytealign (w[22], w[23], offset); + w[22] = amd_bytealign (w[21], w[22], offset); + w[21] = 
amd_bytealign (w[20], w[21], offset); + w[20] = amd_bytealign (w[19], w[20], offset); + w[19] = amd_bytealign (w[18], w[19], offset); + w[18] = amd_bytealign (w[17], w[18], offset); + w[17] = amd_bytealign (w[16], w[17], offset); + w[16] = amd_bytealign (w[15], w[16], offset); + w[15] = amd_bytealign (w[14], w[15], offset); + w[14] = amd_bytealign (w[13], w[14], offset); + w[13] = amd_bytealign (w[12], w[13], offset); + w[12] = amd_bytealign (w[11], w[12], offset); + w[11] = amd_bytealign (w[10], w[11], offset); + w[10] = amd_bytealign (w[ 9], w[10], offset); + w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign (w[61], w[62], offset); + w[62] = amd_bytealign (w[60], w[61], offset); + w[61] = amd_bytealign (w[59], w[60], offset); + w[60] = amd_bytealign (w[58], w[59], offset); + w[59] = amd_bytealign (w[57], w[58], offset); + w[58] = amd_bytealign (w[56], w[57], offset); + w[57] = amd_bytealign (w[55], w[56], offset); + w[56] = amd_bytealign (w[54], w[55], offset); + w[55] = amd_bytealign (w[53], w[54], offset); + w[54] = amd_bytealign (w[52], w[53], offset); + w[53] = amd_bytealign (w[51], w[52], offset); + w[52] = amd_bytealign (w[50], w[51], offset); + w[51] = amd_bytealign (w[49], w[50], offset); + w[50] = amd_bytealign (w[48], w[49], offset); + w[49] = amd_bytealign (w[47], w[48], offset); + w[48] = amd_bytealign (w[46], w[47], offset); + w[47] = amd_bytealign (w[45], w[46], offset); + w[46] = amd_bytealign (w[44], w[45], offset); + w[45] = amd_bytealign (w[43], w[44], offset); + w[44] = 
amd_bytealign (w[42], w[43], offset); + w[43] = amd_bytealign (w[41], w[42], offset); + w[42] = amd_bytealign (w[40], w[41], offset); + w[41] = amd_bytealign (w[39], w[40], offset); + w[40] = amd_bytealign (w[38], w[39], offset); + w[39] = amd_bytealign (w[37], w[38], offset); + w[38] = amd_bytealign (w[36], w[37], offset); + w[37] = amd_bytealign (w[35], w[36], offset); + w[36] = amd_bytealign (w[34], w[35], offset); + w[35] = amd_bytealign (w[33], w[34], offset); + w[34] = amd_bytealign (w[32], w[33], offset); + w[33] = amd_bytealign (w[31], w[32], offset); + w[32] = amd_bytealign (w[30], w[31], offset); + w[31] = amd_bytealign (w[29], w[30], offset); + w[30] = amd_bytealign (w[28], w[29], offset); + w[29] = amd_bytealign (w[27], w[28], offset); + w[28] = amd_bytealign (w[26], w[27], offset); + w[27] = amd_bytealign (w[25], w[26], offset); + w[26] = amd_bytealign (w[24], w[25], offset); + w[25] = amd_bytealign (w[23], w[24], offset); + w[24] = amd_bytealign (w[22], w[23], offset); + w[23] = amd_bytealign (w[21], w[22], offset); + w[22] = amd_bytealign (w[20], w[21], offset); + w[21] = amd_bytealign (w[19], w[20], offset); + w[20] = amd_bytealign (w[18], w[19], offset); + w[19] = amd_bytealign (w[17], w[18], offset); + w[18] = amd_bytealign (w[16], w[17], offset); + w[17] = amd_bytealign (w[15], w[16], offset); + w[16] = amd_bytealign (w[14], w[15], offset); + w[15] = amd_bytealign (w[13], w[14], offset); + w[14] = amd_bytealign (w[12], w[13], offset); + w[13] = amd_bytealign (w[11], w[12], offset); + w[12] = amd_bytealign (w[10], w[11], offset); + w[11] = amd_bytealign (w[ 9], w[10], offset); + w[10] = amd_bytealign (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign (w[ 1], w[ 2], 
offset); + w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign (w[60], w[61], offset); + w[62] = amd_bytealign (w[59], w[60], offset); + w[61] = amd_bytealign (w[58], w[59], offset); + w[60] = amd_bytealign (w[57], w[58], offset); + w[59] = amd_bytealign (w[56], w[57], offset); + w[58] = amd_bytealign (w[55], w[56], offset); + w[57] = amd_bytealign (w[54], w[55], offset); + w[56] = amd_bytealign (w[53], w[54], offset); + w[55] = amd_bytealign (w[52], w[53], offset); + w[54] = amd_bytealign (w[51], w[52], offset); + w[53] = amd_bytealign (w[50], w[51], offset); + w[52] = amd_bytealign (w[49], w[50], offset); + w[51] = amd_bytealign (w[48], w[49], offset); + w[50] = amd_bytealign (w[47], w[48], offset); + w[49] = amd_bytealign (w[46], w[47], offset); + w[48] = amd_bytealign (w[45], w[46], offset); + w[47] = amd_bytealign (w[44], w[45], offset); + w[46] = amd_bytealign (w[43], w[44], offset); + w[45] = amd_bytealign (w[42], w[43], offset); + w[44] = amd_bytealign (w[41], w[42], offset); + w[43] = amd_bytealign (w[40], w[41], offset); + w[42] = amd_bytealign (w[39], w[40], offset); + w[41] = amd_bytealign (w[38], w[39], offset); + w[40] = amd_bytealign (w[37], w[38], offset); + w[39] = amd_bytealign (w[36], w[37], offset); + w[38] = amd_bytealign (w[35], w[36], offset); + w[37] = amd_bytealign (w[34], w[35], offset); + w[36] = amd_bytealign (w[33], w[34], offset); + w[35] = amd_bytealign (w[32], w[33], offset); + w[34] = amd_bytealign (w[31], w[32], offset); + w[33] = amd_bytealign (w[30], w[31], offset); + w[32] = amd_bytealign (w[29], w[30], offset); + w[31] = amd_bytealign (w[28], w[29], offset); + w[30] = amd_bytealign (w[27], w[28], offset); + w[29] = amd_bytealign (w[26], w[27], offset); + w[28] = amd_bytealign (w[25], w[26], offset); + w[27] = amd_bytealign (w[24], w[25], offset); + w[26] = amd_bytealign (w[23], w[24], offset); + w[25] = amd_bytealign (w[22], 
w[23], offset); + w[24] = amd_bytealign (w[21], w[22], offset); + w[23] = amd_bytealign (w[20], w[21], offset); + w[22] = amd_bytealign (w[19], w[20], offset); + w[21] = amd_bytealign (w[18], w[19], offset); + w[20] = amd_bytealign (w[17], w[18], offset); + w[19] = amd_bytealign (w[16], w[17], offset); + w[18] = amd_bytealign (w[15], w[16], offset); + w[17] = amd_bytealign (w[14], w[15], offset); + w[16] = amd_bytealign (w[13], w[14], offset); + w[15] = amd_bytealign (w[12], w[13], offset); + w[14] = amd_bytealign (w[11], w[12], offset); + w[13] = amd_bytealign (w[10], w[11], offset); + w[12] = amd_bytealign (w[ 9], w[10], offset); + w[11] = amd_bytealign (w[ 8], w[ 9], offset); + w[10] = amd_bytealign (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign (w[59], w[60], offset); + w[62] = amd_bytealign (w[58], w[59], offset); + w[61] = amd_bytealign (w[57], w[58], offset); + w[60] = amd_bytealign (w[56], w[57], offset); + w[59] = amd_bytealign (w[55], w[56], offset); + w[58] = amd_bytealign (w[54], w[55], offset); + w[57] = amd_bytealign (w[53], w[54], offset); + w[56] = amd_bytealign (w[52], w[53], offset); + w[55] = amd_bytealign (w[51], w[52], offset); + w[54] = amd_bytealign (w[50], w[51], offset); + w[53] = amd_bytealign (w[49], w[50], offset); + w[52] = amd_bytealign (w[48], w[49], offset); + w[51] = amd_bytealign (w[47], w[48], offset); + w[50] = amd_bytealign (w[46], w[47], offset); + w[49] = amd_bytealign (w[45], w[46], offset); + w[48] = amd_bytealign (w[44], w[45], offset); + w[47] = amd_bytealign (w[43], w[44], offset); + w[46] = 
amd_bytealign (w[42], w[43], offset); + w[45] = amd_bytealign (w[41], w[42], offset); + w[44] = amd_bytealign (w[40], w[41], offset); + w[43] = amd_bytealign (w[39], w[40], offset); + w[42] = amd_bytealign (w[38], w[39], offset); + w[41] = amd_bytealign (w[37], w[38], offset); + w[40] = amd_bytealign (w[36], w[37], offset); + w[39] = amd_bytealign (w[35], w[36], offset); + w[38] = amd_bytealign (w[34], w[35], offset); + w[37] = amd_bytealign (w[33], w[34], offset); + w[36] = amd_bytealign (w[32], w[33], offset); + w[35] = amd_bytealign (w[31], w[32], offset); + w[34] = amd_bytealign (w[30], w[31], offset); + w[33] = amd_bytealign (w[29], w[30], offset); + w[32] = amd_bytealign (w[28], w[29], offset); + w[31] = amd_bytealign (w[27], w[28], offset); + w[30] = amd_bytealign (w[26], w[27], offset); + w[29] = amd_bytealign (w[25], w[26], offset); + w[28] = amd_bytealign (w[24], w[25], offset); + w[27] = amd_bytealign (w[23], w[24], offset); + w[26] = amd_bytealign (w[22], w[23], offset); + w[25] = amd_bytealign (w[21], w[22], offset); + w[24] = amd_bytealign (w[20], w[21], offset); + w[23] = amd_bytealign (w[19], w[20], offset); + w[22] = amd_bytealign (w[18], w[19], offset); + w[21] = amd_bytealign (w[17], w[18], offset); + w[20] = amd_bytealign (w[16], w[17], offset); + w[19] = amd_bytealign (w[15], w[16], offset); + w[18] = amd_bytealign (w[14], w[15], offset); + w[17] = amd_bytealign (w[13], w[14], offset); + w[16] = amd_bytealign (w[12], w[13], offset); + w[15] = amd_bytealign (w[11], w[12], offset); + w[14] = amd_bytealign (w[10], w[11], offset); + w[13] = amd_bytealign (w[ 9], w[10], offset); + w[12] = amd_bytealign (w[ 8], w[ 9], offset); + w[11] = amd_bytealign (w[ 7], w[ 8], offset); + w[10] = amd_bytealign (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign (w[ 1], w[ 2], 
offset); + w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign (w[58], w[59], offset); + w[62] = amd_bytealign (w[57], w[58], offset); + w[61] = amd_bytealign (w[56], w[57], offset); + w[60] = amd_bytealign (w[55], w[56], offset); + w[59] = amd_bytealign (w[54], w[55], offset); + w[58] = amd_bytealign (w[53], w[54], offset); + w[57] = amd_bytealign (w[52], w[53], offset); + w[56] = amd_bytealign (w[51], w[52], offset); + w[55] = amd_bytealign (w[50], w[51], offset); + w[54] = amd_bytealign (w[49], w[50], offset); + w[53] = amd_bytealign (w[48], w[49], offset); + w[52] = amd_bytealign (w[47], w[48], offset); + w[51] = amd_bytealign (w[46], w[47], offset); + w[50] = amd_bytealign (w[45], w[46], offset); + w[49] = amd_bytealign (w[44], w[45], offset); + w[48] = amd_bytealign (w[43], w[44], offset); + w[47] = amd_bytealign (w[42], w[43], offset); + w[46] = amd_bytealign (w[41], w[42], offset); + w[45] = amd_bytealign (w[40], w[41], offset); + w[44] = amd_bytealign (w[39], w[40], offset); + w[43] = amd_bytealign (w[38], w[39], offset); + w[42] = amd_bytealign (w[37], w[38], offset); + w[41] = amd_bytealign (w[36], w[37], offset); + w[40] = amd_bytealign (w[35], w[36], offset); + w[39] = amd_bytealign (w[34], w[35], offset); + w[38] = amd_bytealign (w[33], w[34], offset); + w[37] = amd_bytealign (w[32], w[33], offset); + w[36] = amd_bytealign (w[31], w[32], offset); + w[35] = amd_bytealign (w[30], w[31], offset); + w[34] = amd_bytealign (w[29], w[30], offset); + w[33] = amd_bytealign (w[28], w[29], offset); + w[32] = amd_bytealign (w[27], w[28], offset); + w[31] = amd_bytealign (w[26], w[27], offset); + w[30] = amd_bytealign (w[25], w[26], offset); + w[29] = amd_bytealign (w[24], w[25], offset); + w[28] = amd_bytealign (w[23], w[24], offset); + w[27] = amd_bytealign (w[22], w[23], offset); + w[26] = amd_bytealign (w[21], w[22], offset); + w[25] = 
amd_bytealign (w[20], w[21], offset); + w[24] = amd_bytealign (w[19], w[20], offset); + w[23] = amd_bytealign (w[18], w[19], offset); + w[22] = amd_bytealign (w[17], w[18], offset); + w[21] = amd_bytealign (w[16], w[17], offset); + w[20] = amd_bytealign (w[15], w[16], offset); + w[19] = amd_bytealign (w[14], w[15], offset); + w[18] = amd_bytealign (w[13], w[14], offset); + w[17] = amd_bytealign (w[12], w[13], offset); + w[16] = amd_bytealign (w[11], w[12], offset); + w[15] = amd_bytealign (w[10], w[11], offset); + w[14] = amd_bytealign (w[ 9], w[10], offset); + w[13] = amd_bytealign (w[ 8], w[ 9], offset); + w[12] = amd_bytealign (w[ 7], w[ 8], offset); + w[11] = amd_bytealign (w[ 6], w[ 7], offset); + w[10] = amd_bytealign (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign (w[57], w[58], offset); + w[62] = amd_bytealign (w[56], w[57], offset); + w[61] = amd_bytealign (w[55], w[56], offset); + w[60] = amd_bytealign (w[54], w[55], offset); + w[59] = amd_bytealign (w[53], w[54], offset); + w[58] = amd_bytealign (w[52], w[53], offset); + w[57] = amd_bytealign (w[51], w[52], offset); + w[56] = amd_bytealign (w[50], w[51], offset); + w[55] = amd_bytealign (w[49], w[50], offset); + w[54] = amd_bytealign (w[48], w[49], offset); + w[53] = amd_bytealign (w[47], w[48], offset); + w[52] = amd_bytealign (w[46], w[47], offset); + w[51] = amd_bytealign (w[45], w[46], offset); + w[50] = amd_bytealign (w[44], w[45], offset); + w[49] = amd_bytealign (w[43], w[44], offset); + w[48] = amd_bytealign (w[42], w[43], offset); + w[47] = amd_bytealign (w[41], w[42], offset); + w[46] = amd_bytealign (w[40], w[41], offset); + w[45] = 
amd_bytealign (w[39], w[40], offset); + w[44] = amd_bytealign (w[38], w[39], offset); + w[43] = amd_bytealign (w[37], w[38], offset); + w[42] = amd_bytealign (w[36], w[37], offset); + w[41] = amd_bytealign (w[35], w[36], offset); + w[40] = amd_bytealign (w[34], w[35], offset); + w[39] = amd_bytealign (w[33], w[34], offset); + w[38] = amd_bytealign (w[32], w[33], offset); + w[37] = amd_bytealign (w[31], w[32], offset); + w[36] = amd_bytealign (w[30], w[31], offset); + w[35] = amd_bytealign (w[29], w[30], offset); + w[34] = amd_bytealign (w[28], w[29], offset); + w[33] = amd_bytealign (w[27], w[28], offset); + w[32] = amd_bytealign (w[26], w[27], offset); + w[31] = amd_bytealign (w[25], w[26], offset); + w[30] = amd_bytealign (w[24], w[25], offset); + w[29] = amd_bytealign (w[23], w[24], offset); + w[28] = amd_bytealign (w[22], w[23], offset); + w[27] = amd_bytealign (w[21], w[22], offset); + w[26] = amd_bytealign (w[20], w[21], offset); + w[25] = amd_bytealign (w[19], w[20], offset); + w[24] = amd_bytealign (w[18], w[19], offset); + w[23] = amd_bytealign (w[17], w[18], offset); + w[22] = amd_bytealign (w[16], w[17], offset); + w[21] = amd_bytealign (w[15], w[16], offset); + w[20] = amd_bytealign (w[14], w[15], offset); + w[19] = amd_bytealign (w[13], w[14], offset); + w[18] = amd_bytealign (w[12], w[13], offset); + w[17] = amd_bytealign (w[11], w[12], offset); + w[16] = amd_bytealign (w[10], w[11], offset); + w[15] = amd_bytealign (w[ 9], w[10], offset); + w[14] = amd_bytealign (w[ 8], w[ 9], offset); + w[13] = amd_bytealign (w[ 7], w[ 8], offset); + w[12] = amd_bytealign (w[ 6], w[ 7], offset); + w[11] = amd_bytealign (w[ 5], w[ 6], offset); + w[10] = amd_bytealign (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 
1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign (w[56], w[57], offset); + w[62] = amd_bytealign (w[55], w[56], offset); + w[61] = amd_bytealign (w[54], w[55], offset); + w[60] = amd_bytealign (w[53], w[54], offset); + w[59] = amd_bytealign (w[52], w[53], offset); + w[58] = amd_bytealign (w[51], w[52], offset); + w[57] = amd_bytealign (w[50], w[51], offset); + w[56] = amd_bytealign (w[49], w[50], offset); + w[55] = amd_bytealign (w[48], w[49], offset); + w[54] = amd_bytealign (w[47], w[48], offset); + w[53] = amd_bytealign (w[46], w[47], offset); + w[52] = amd_bytealign (w[45], w[46], offset); + w[51] = amd_bytealign (w[44], w[45], offset); + w[50] = amd_bytealign (w[43], w[44], offset); + w[49] = amd_bytealign (w[42], w[43], offset); + w[48] = amd_bytealign (w[41], w[42], offset); + w[47] = amd_bytealign (w[40], w[41], offset); + w[46] = amd_bytealign (w[39], w[40], offset); + w[45] = amd_bytealign (w[38], w[39], offset); + w[44] = amd_bytealign (w[37], w[38], offset); + w[43] = amd_bytealign (w[36], w[37], offset); + w[42] = amd_bytealign (w[35], w[36], offset); + w[41] = amd_bytealign (w[34], w[35], offset); + w[40] = amd_bytealign (w[33], w[34], offset); + w[39] = amd_bytealign (w[32], w[33], offset); + w[38] = amd_bytealign (w[31], w[32], offset); + w[37] = amd_bytealign (w[30], w[31], offset); + w[36] = amd_bytealign (w[29], w[30], offset); + w[35] = amd_bytealign (w[28], w[29], offset); + w[34] = amd_bytealign (w[27], w[28], offset); + w[33] = amd_bytealign (w[26], w[27], offset); + w[32] = amd_bytealign (w[25], w[26], offset); + w[31] = amd_bytealign (w[24], w[25], offset); + w[30] = amd_bytealign (w[23], w[24], offset); + w[29] = amd_bytealign (w[22], w[23], offset); + w[28] = amd_bytealign (w[21], w[22], offset); + w[27] = amd_bytealign (w[20], w[21], offset); + w[26] = amd_bytealign (w[19], w[20], offset); + w[25] = amd_bytealign (w[18], w[19], offset); + w[24] = amd_bytealign (w[17], w[18], offset); + w[23] = amd_bytealign (w[16], 
w[17], offset); + w[22] = amd_bytealign (w[15], w[16], offset); + w[21] = amd_bytealign (w[14], w[15], offset); + w[20] = amd_bytealign (w[13], w[14], offset); + w[19] = amd_bytealign (w[12], w[13], offset); + w[18] = amd_bytealign (w[11], w[12], offset); + w[17] = amd_bytealign (w[10], w[11], offset); + w[16] = amd_bytealign (w[ 9], w[10], offset); + w[15] = amd_bytealign (w[ 8], w[ 9], offset); + w[14] = amd_bytealign (w[ 7], w[ 8], offset); + w[13] = amd_bytealign (w[ 6], w[ 7], offset); + w[12] = amd_bytealign (w[ 5], w[ 6], offset); + w[11] = amd_bytealign (w[ 4], w[ 5], offset); + w[10] = amd_bytealign (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign (w[55], w[56], offset); + w[62] = amd_bytealign (w[54], w[55], offset); + w[61] = amd_bytealign (w[53], w[54], offset); + w[60] = amd_bytealign (w[52], w[53], offset); + w[59] = amd_bytealign (w[51], w[52], offset); + w[58] = amd_bytealign (w[50], w[51], offset); + w[57] = amd_bytealign (w[49], w[50], offset); + w[56] = amd_bytealign (w[48], w[49], offset); + w[55] = amd_bytealign (w[47], w[48], offset); + w[54] = amd_bytealign (w[46], w[47], offset); + w[53] = amd_bytealign (w[45], w[46], offset); + w[52] = amd_bytealign (w[44], w[45], offset); + w[51] = amd_bytealign (w[43], w[44], offset); + w[50] = amd_bytealign (w[42], w[43], offset); + w[49] = amd_bytealign (w[41], w[42], offset); + w[48] = amd_bytealign (w[40], w[41], offset); + w[47] = amd_bytealign (w[39], w[40], offset); + w[46] = amd_bytealign (w[38], w[39], offset); + w[45] = amd_bytealign (w[37], w[38], offset); + w[44] = amd_bytealign (w[36], w[37], offset); + w[43] = amd_bytealign (w[35], w[36], offset); + w[42] = amd_bytealign (w[34], w[35], offset); + w[41] = 
amd_bytealign (w[33], w[34], offset); + w[40] = amd_bytealign (w[32], w[33], offset); + w[39] = amd_bytealign (w[31], w[32], offset); + w[38] = amd_bytealign (w[30], w[31], offset); + w[37] = amd_bytealign (w[29], w[30], offset); + w[36] = amd_bytealign (w[28], w[29], offset); + w[35] = amd_bytealign (w[27], w[28], offset); + w[34] = amd_bytealign (w[26], w[27], offset); + w[33] = amd_bytealign (w[25], w[26], offset); + w[32] = amd_bytealign (w[24], w[25], offset); + w[31] = amd_bytealign (w[23], w[24], offset); + w[30] = amd_bytealign (w[22], w[23], offset); + w[29] = amd_bytealign (w[21], w[22], offset); + w[28] = amd_bytealign (w[20], w[21], offset); + w[27] = amd_bytealign (w[19], w[20], offset); + w[26] = amd_bytealign (w[18], w[19], offset); + w[25] = amd_bytealign (w[17], w[18], offset); + w[24] = amd_bytealign (w[16], w[17], offset); + w[23] = amd_bytealign (w[15], w[16], offset); + w[22] = amd_bytealign (w[14], w[15], offset); + w[21] = amd_bytealign (w[13], w[14], offset); + w[20] = amd_bytealign (w[12], w[13], offset); + w[19] = amd_bytealign (w[11], w[12], offset); + w[18] = amd_bytealign (w[10], w[11], offset); + w[17] = amd_bytealign (w[ 9], w[10], offset); + w[16] = amd_bytealign (w[ 8], w[ 9], offset); + w[15] = amd_bytealign (w[ 7], w[ 8], offset); + w[14] = amd_bytealign (w[ 6], w[ 7], offset); + w[13] = amd_bytealign (w[ 5], w[ 6], offset); + w[12] = amd_bytealign (w[ 4], w[ 5], offset); + w[11] = amd_bytealign (w[ 3], w[ 4], offset); + w[10] = amd_bytealign (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = amd_bytealign (w[54], w[55], offset); + w[62] = amd_bytealign (w[53], w[54], offset); + w[61] = amd_bytealign (w[52], w[53], offset); + w[60] = amd_bytealign (w[51], w[52], offset); + w[59] = amd_bytealign 
(w[50], w[51], offset); + w[58] = amd_bytealign (w[49], w[50], offset); + w[57] = amd_bytealign (w[48], w[49], offset); + w[56] = amd_bytealign (w[47], w[48], offset); + w[55] = amd_bytealign (w[46], w[47], offset); + w[54] = amd_bytealign (w[45], w[46], offset); + w[53] = amd_bytealign (w[44], w[45], offset); + w[52] = amd_bytealign (w[43], w[44], offset); + w[51] = amd_bytealign (w[42], w[43], offset); + w[50] = amd_bytealign (w[41], w[42], offset); + w[49] = amd_bytealign (w[40], w[41], offset); + w[48] = amd_bytealign (w[39], w[40], offset); + w[47] = amd_bytealign (w[38], w[39], offset); + w[46] = amd_bytealign (w[37], w[38], offset); + w[45] = amd_bytealign (w[36], w[37], offset); + w[44] = amd_bytealign (w[35], w[36], offset); + w[43] = amd_bytealign (w[34], w[35], offset); + w[42] = amd_bytealign (w[33], w[34], offset); + w[41] = amd_bytealign (w[32], w[33], offset); + w[40] = amd_bytealign (w[31], w[32], offset); + w[39] = amd_bytealign (w[30], w[31], offset); + w[38] = amd_bytealign (w[29], w[30], offset); + w[37] = amd_bytealign (w[28], w[29], offset); + w[36] = amd_bytealign (w[27], w[28], offset); + w[35] = amd_bytealign (w[26], w[27], offset); + w[34] = amd_bytealign (w[25], w[26], offset); + w[33] = amd_bytealign (w[24], w[25], offset); + w[32] = amd_bytealign (w[23], w[24], offset); + w[31] = amd_bytealign (w[22], w[23], offset); + w[30] = amd_bytealign (w[21], w[22], offset); + w[29] = amd_bytealign (w[20], w[21], offset); + w[28] = amd_bytealign (w[19], w[20], offset); + w[27] = amd_bytealign (w[18], w[19], offset); + w[26] = amd_bytealign (w[17], w[18], offset); + w[25] = amd_bytealign (w[16], w[17], offset); + w[24] = amd_bytealign (w[15], w[16], offset); + w[23] = amd_bytealign (w[14], w[15], offset); + w[22] = amd_bytealign (w[13], w[14], offset); + w[21] = amd_bytealign (w[12], w[13], offset); + w[20] = amd_bytealign (w[11], w[12], offset); + w[19] = amd_bytealign (w[10], w[11], offset); + w[18] = amd_bytealign (w[ 9], w[10], offset); + w[17] 
= amd_bytealign (w[ 8], w[ 9], offset); + w[16] = amd_bytealign (w[ 7], w[ 8], offset); + w[15] = amd_bytealign (w[ 6], w[ 7], offset); + w[14] = amd_bytealign (w[ 5], w[ 6], offset); + w[13] = amd_bytealign (w[ 4], w[ 5], offset); + w[12] = amd_bytealign (w[ 3], w[ 4], offset); + w[11] = amd_bytealign (w[ 2], w[ 3], offset); + w[10] = amd_bytealign (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign (w[53], w[54], offset); + w[62] = amd_bytealign (w[52], w[53], offset); + w[61] = amd_bytealign (w[51], w[52], offset); + w[60] = amd_bytealign (w[50], w[51], offset); + w[59] = amd_bytealign (w[49], w[50], offset); + w[58] = amd_bytealign (w[48], w[49], offset); + w[57] = amd_bytealign (w[47], w[48], offset); + w[56] = amd_bytealign (w[46], w[47], offset); + w[55] = amd_bytealign (w[45], w[46], offset); + w[54] = amd_bytealign (w[44], w[45], offset); + w[53] = amd_bytealign (w[43], w[44], offset); + w[52] = amd_bytealign (w[42], w[43], offset); + w[51] = amd_bytealign (w[41], w[42], offset); + w[50] = amd_bytealign (w[40], w[41], offset); + w[49] = amd_bytealign (w[39], w[40], offset); + w[48] = amd_bytealign (w[38], w[39], offset); + w[47] = amd_bytealign (w[37], w[38], offset); + w[46] = amd_bytealign (w[36], w[37], offset); + w[45] = amd_bytealign (w[35], w[36], offset); + w[44] = amd_bytealign (w[34], w[35], offset); + w[43] = amd_bytealign (w[33], w[34], offset); + w[42] = amd_bytealign (w[32], w[33], offset); + w[41] = amd_bytealign (w[31], w[32], offset); + w[40] = amd_bytealign (w[30], w[31], offset); + w[39] = amd_bytealign (w[29], w[30], offset); + w[38] = amd_bytealign (w[28], w[29], offset); + w[37] = amd_bytealign (w[27], w[28], offset); + w[36] = amd_bytealign (w[26], w[27], offset); + w[35] = amd_bytealign (w[25], w[26], offset); + w[34] = 
amd_bytealign (w[24], w[25], offset); + w[33] = amd_bytealign (w[23], w[24], offset); + w[32] = amd_bytealign (w[22], w[23], offset); + w[31] = amd_bytealign (w[21], w[22], offset); + w[30] = amd_bytealign (w[20], w[21], offset); + w[29] = amd_bytealign (w[19], w[20], offset); + w[28] = amd_bytealign (w[18], w[19], offset); + w[27] = amd_bytealign (w[17], w[18], offset); + w[26] = amd_bytealign (w[16], w[17], offset); + w[25] = amd_bytealign (w[15], w[16], offset); + w[24] = amd_bytealign (w[14], w[15], offset); + w[23] = amd_bytealign (w[13], w[14], offset); + w[22] = amd_bytealign (w[12], w[13], offset); + w[21] = amd_bytealign (w[11], w[12], offset); + w[20] = amd_bytealign (w[10], w[11], offset); + w[19] = amd_bytealign (w[ 9], w[10], offset); + w[18] = amd_bytealign (w[ 8], w[ 9], offset); + w[17] = amd_bytealign (w[ 7], w[ 8], offset); + w[16] = amd_bytealign (w[ 6], w[ 7], offset); + w[15] = amd_bytealign (w[ 5], w[ 6], offset); + w[14] = amd_bytealign (w[ 4], w[ 5], offset); + w[13] = amd_bytealign (w[ 3], w[ 4], offset); + w[12] = amd_bytealign (w[ 2], w[ 3], offset); + w[11] = amd_bytealign (w[ 1], w[ 2], offset); + w[10] = amd_bytealign (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign (w[52], w[53], offset); + w[62] = amd_bytealign (w[51], w[52], offset); + w[61] = amd_bytealign (w[50], w[51], offset); + w[60] = amd_bytealign (w[49], w[50], offset); + w[59] = amd_bytealign (w[48], w[49], offset); + w[58] = amd_bytealign (w[47], w[48], offset); + w[57] = amd_bytealign (w[46], w[47], offset); + w[56] = amd_bytealign (w[45], w[46], offset); + w[55] = amd_bytealign (w[44], w[45], offset); + w[54] = amd_bytealign (w[43], w[44], offset); + w[53] = amd_bytealign (w[42], w[43], offset); + w[52] = amd_bytealign (w[41], w[42], offset); + w[51] = amd_bytealign (w[40], w[41], offset); 
+ w[50] = amd_bytealign (w[39], w[40], offset); + w[49] = amd_bytealign (w[38], w[39], offset); + w[48] = amd_bytealign (w[37], w[38], offset); + w[47] = amd_bytealign (w[36], w[37], offset); + w[46] = amd_bytealign (w[35], w[36], offset); + w[45] = amd_bytealign (w[34], w[35], offset); + w[44] = amd_bytealign (w[33], w[34], offset); + w[43] = amd_bytealign (w[32], w[33], offset); + w[42] = amd_bytealign (w[31], w[32], offset); + w[41] = amd_bytealign (w[30], w[31], offset); + w[40] = amd_bytealign (w[29], w[30], offset); + w[39] = amd_bytealign (w[28], w[29], offset); + w[38] = amd_bytealign (w[27], w[28], offset); + w[37] = amd_bytealign (w[26], w[27], offset); + w[36] = amd_bytealign (w[25], w[26], offset); + w[35] = amd_bytealign (w[24], w[25], offset); + w[34] = amd_bytealign (w[23], w[24], offset); + w[33] = amd_bytealign (w[22], w[23], offset); + w[32] = amd_bytealign (w[21], w[22], offset); + w[31] = amd_bytealign (w[20], w[21], offset); + w[30] = amd_bytealign (w[19], w[20], offset); + w[29] = amd_bytealign (w[18], w[19], offset); + w[28] = amd_bytealign (w[17], w[18], offset); + w[27] = amd_bytealign (w[16], w[17], offset); + w[26] = amd_bytealign (w[15], w[16], offset); + w[25] = amd_bytealign (w[14], w[15], offset); + w[24] = amd_bytealign (w[13], w[14], offset); + w[23] = amd_bytealign (w[12], w[13], offset); + w[22] = amd_bytealign (w[11], w[12], offset); + w[21] = amd_bytealign (w[10], w[11], offset); + w[20] = amd_bytealign (w[ 9], w[10], offset); + w[19] = amd_bytealign (w[ 8], w[ 9], offset); + w[18] = amd_bytealign (w[ 7], w[ 8], offset); + w[17] = amd_bytealign (w[ 6], w[ 7], offset); + w[16] = amd_bytealign (w[ 5], w[ 6], offset); + w[15] = amd_bytealign (w[ 4], w[ 5], offset); + w[14] = amd_bytealign (w[ 3], w[ 4], offset); + w[13] = amd_bytealign (w[ 2], w[ 3], offset); + w[12] = amd_bytealign (w[ 1], w[ 2], offset); + w[11] = amd_bytealign (w[ 0], w[ 1], offset); + w[10] = amd_bytealign ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] 
= 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign (w[51], w[52], offset); + w[62] = amd_bytealign (w[50], w[51], offset); + w[61] = amd_bytealign (w[49], w[50], offset); + w[60] = amd_bytealign (w[48], w[49], offset); + w[59] = amd_bytealign (w[47], w[48], offset); + w[58] = amd_bytealign (w[46], w[47], offset); + w[57] = amd_bytealign (w[45], w[46], offset); + w[56] = amd_bytealign (w[44], w[45], offset); + w[55] = amd_bytealign (w[43], w[44], offset); + w[54] = amd_bytealign (w[42], w[43], offset); + w[53] = amd_bytealign (w[41], w[42], offset); + w[52] = amd_bytealign (w[40], w[41], offset); + w[51] = amd_bytealign (w[39], w[40], offset); + w[50] = amd_bytealign (w[38], w[39], offset); + w[49] = amd_bytealign (w[37], w[38], offset); + w[48] = amd_bytealign (w[36], w[37], offset); + w[47] = amd_bytealign (w[35], w[36], offset); + w[46] = amd_bytealign (w[34], w[35], offset); + w[45] = amd_bytealign (w[33], w[34], offset); + w[44] = amd_bytealign (w[32], w[33], offset); + w[43] = amd_bytealign (w[31], w[32], offset); + w[42] = amd_bytealign (w[30], w[31], offset); + w[41] = amd_bytealign (w[29], w[30], offset); + w[40] = amd_bytealign (w[28], w[29], offset); + w[39] = amd_bytealign (w[27], w[28], offset); + w[38] = amd_bytealign (w[26], w[27], offset); + w[37] = amd_bytealign (w[25], w[26], offset); + w[36] = amd_bytealign (w[24], w[25], offset); + w[35] = amd_bytealign (w[23], w[24], offset); + w[34] = amd_bytealign (w[22], w[23], offset); + w[33] = amd_bytealign (w[21], w[22], offset); + w[32] = amd_bytealign (w[20], w[21], offset); + w[31] = amd_bytealign (w[19], w[20], offset); + w[30] = amd_bytealign (w[18], w[19], offset); + w[29] = amd_bytealign (w[17], w[18], offset); + w[28] = amd_bytealign (w[16], w[17], offset); + w[27] = amd_bytealign (w[15], w[16], offset); + w[26] = amd_bytealign (w[14], w[15], offset); + w[25] = amd_bytealign (w[13], w[14], offset); + w[24] 
= amd_bytealign (w[12], w[13], offset); + w[23] = amd_bytealign (w[11], w[12], offset); + w[22] = amd_bytealign (w[10], w[11], offset); + w[21] = amd_bytealign (w[ 9], w[10], offset); + w[20] = amd_bytealign (w[ 8], w[ 9], offset); + w[19] = amd_bytealign (w[ 7], w[ 8], offset); + w[18] = amd_bytealign (w[ 6], w[ 7], offset); + w[17] = amd_bytealign (w[ 5], w[ 6], offset); + w[16] = amd_bytealign (w[ 4], w[ 5], offset); + w[15] = amd_bytealign (w[ 3], w[ 4], offset); + w[14] = amd_bytealign (w[ 2], w[ 3], offset); + w[13] = amd_bytealign (w[ 1], w[ 2], offset); + w[12] = amd_bytealign (w[ 0], w[ 1], offset); + w[11] = amd_bytealign ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign (w[50], w[51], offset); + w[62] = amd_bytealign (w[49], w[50], offset); + w[61] = amd_bytealign (w[48], w[49], offset); + w[60] = amd_bytealign (w[47], w[48], offset); + w[59] = amd_bytealign (w[46], w[47], offset); + w[58] = amd_bytealign (w[45], w[46], offset); + w[57] = amd_bytealign (w[44], w[45], offset); + w[56] = amd_bytealign (w[43], w[44], offset); + w[55] = amd_bytealign (w[42], w[43], offset); + w[54] = amd_bytealign (w[41], w[42], offset); + w[53] = amd_bytealign (w[40], w[41], offset); + w[52] = amd_bytealign (w[39], w[40], offset); + w[51] = amd_bytealign (w[38], w[39], offset); + w[50] = amd_bytealign (w[37], w[38], offset); + w[49] = amd_bytealign (w[36], w[37], offset); + w[48] = amd_bytealign (w[35], w[36], offset); + w[47] = amd_bytealign (w[34], w[35], offset); + w[46] = amd_bytealign (w[33], w[34], offset); + w[45] = amd_bytealign (w[32], w[33], offset); + w[44] = amd_bytealign (w[31], w[32], offset); + w[43] = amd_bytealign (w[30], w[31], offset); + w[42] = amd_bytealign (w[29], w[30], offset); + w[41] = amd_bytealign (w[28], w[29], offset); + w[40] = amd_bytealign (w[27], w[28], offset); + w[39] = 
amd_bytealign (w[26], w[27], offset); + w[38] = amd_bytealign (w[25], w[26], offset); + w[37] = amd_bytealign (w[24], w[25], offset); + w[36] = amd_bytealign (w[23], w[24], offset); + w[35] = amd_bytealign (w[22], w[23], offset); + w[34] = amd_bytealign (w[21], w[22], offset); + w[33] = amd_bytealign (w[20], w[21], offset); + w[32] = amd_bytealign (w[19], w[20], offset); + w[31] = amd_bytealign (w[18], w[19], offset); + w[30] = amd_bytealign (w[17], w[18], offset); + w[29] = amd_bytealign (w[16], w[17], offset); + w[28] = amd_bytealign (w[15], w[16], offset); + w[27] = amd_bytealign (w[14], w[15], offset); + w[26] = amd_bytealign (w[13], w[14], offset); + w[25] = amd_bytealign (w[12], w[13], offset); + w[24] = amd_bytealign (w[11], w[12], offset); + w[23] = amd_bytealign (w[10], w[11], offset); + w[22] = amd_bytealign (w[ 9], w[10], offset); + w[21] = amd_bytealign (w[ 8], w[ 9], offset); + w[20] = amd_bytealign (w[ 7], w[ 8], offset); + w[19] = amd_bytealign (w[ 6], w[ 7], offset); + w[18] = amd_bytealign (w[ 5], w[ 6], offset); + w[17] = amd_bytealign (w[ 4], w[ 5], offset); + w[16] = amd_bytealign (w[ 3], w[ 4], offset); + w[15] = amd_bytealign (w[ 2], w[ 3], offset); + w[14] = amd_bytealign (w[ 1], w[ 2], offset); + w[13] = amd_bytealign (w[ 0], w[ 1], offset); + w[12] = amd_bytealign ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign (w[49], w[50], offset); + w[62] = amd_bytealign (w[48], w[49], offset); + w[61] = amd_bytealign (w[47], w[48], offset); + w[60] = amd_bytealign (w[46], w[47], offset); + w[59] = amd_bytealign (w[45], w[46], offset); + w[58] = amd_bytealign (w[44], w[45], offset); + w[57] = amd_bytealign (w[43], w[44], offset); + w[56] = amd_bytealign (w[42], w[43], offset); + w[55] = amd_bytealign (w[41], w[42], offset); + w[54] = amd_bytealign (w[40], w[41], offset); + w[53] 
= amd_bytealign (w[39], w[40], offset); + w[52] = amd_bytealign (w[38], w[39], offset); + w[51] = amd_bytealign (w[37], w[38], offset); + w[50] = amd_bytealign (w[36], w[37], offset); + w[49] = amd_bytealign (w[35], w[36], offset); + w[48] = amd_bytealign (w[34], w[35], offset); + w[47] = amd_bytealign (w[33], w[34], offset); + w[46] = amd_bytealign (w[32], w[33], offset); + w[45] = amd_bytealign (w[31], w[32], offset); + w[44] = amd_bytealign (w[30], w[31], offset); + w[43] = amd_bytealign (w[29], w[30], offset); + w[42] = amd_bytealign (w[28], w[29], offset); + w[41] = amd_bytealign (w[27], w[28], offset); + w[40] = amd_bytealign (w[26], w[27], offset); + w[39] = amd_bytealign (w[25], w[26], offset); + w[38] = amd_bytealign (w[24], w[25], offset); + w[37] = amd_bytealign (w[23], w[24], offset); + w[36] = amd_bytealign (w[22], w[23], offset); + w[35] = amd_bytealign (w[21], w[22], offset); + w[34] = amd_bytealign (w[20], w[21], offset); + w[33] = amd_bytealign (w[19], w[20], offset); + w[32] = amd_bytealign (w[18], w[19], offset); + w[31] = amd_bytealign (w[17], w[18], offset); + w[30] = amd_bytealign (w[16], w[17], offset); + w[29] = amd_bytealign (w[15], w[16], offset); + w[28] = amd_bytealign (w[14], w[15], offset); + w[27] = amd_bytealign (w[13], w[14], offset); + w[26] = amd_bytealign (w[12], w[13], offset); + w[25] = amd_bytealign (w[11], w[12], offset); + w[24] = amd_bytealign (w[10], w[11], offset); + w[23] = amd_bytealign (w[ 9], w[10], offset); + w[22] = amd_bytealign (w[ 8], w[ 9], offset); + w[21] = amd_bytealign (w[ 7], w[ 8], offset); + w[20] = amd_bytealign (w[ 6], w[ 7], offset); + w[19] = amd_bytealign (w[ 5], w[ 6], offset); + w[18] = amd_bytealign (w[ 4], w[ 5], offset); + w[17] = amd_bytealign (w[ 3], w[ 4], offset); + w[16] = amd_bytealign (w[ 2], w[ 3], offset); + w[15] = amd_bytealign (w[ 1], w[ 2], offset); + w[14] = amd_bytealign (w[ 0], w[ 1], offset); + w[13] = amd_bytealign ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + 
w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign (w[48], w[49], offset); + w[62] = amd_bytealign (w[47], w[48], offset); + w[61] = amd_bytealign (w[46], w[47], offset); + w[60] = amd_bytealign (w[45], w[46], offset); + w[59] = amd_bytealign (w[44], w[45], offset); + w[58] = amd_bytealign (w[43], w[44], offset); + w[57] = amd_bytealign (w[42], w[43], offset); + w[56] = amd_bytealign (w[41], w[42], offset); + w[55] = amd_bytealign (w[40], w[41], offset); + w[54] = amd_bytealign (w[39], w[40], offset); + w[53] = amd_bytealign (w[38], w[39], offset); + w[52] = amd_bytealign (w[37], w[38], offset); + w[51] = amd_bytealign (w[36], w[37], offset); + w[50] = amd_bytealign (w[35], w[36], offset); + w[49] = amd_bytealign (w[34], w[35], offset); + w[48] = amd_bytealign (w[33], w[34], offset); + w[47] = amd_bytealign (w[32], w[33], offset); + w[46] = amd_bytealign (w[31], w[32], offset); + w[45] = amd_bytealign (w[30], w[31], offset); + w[44] = amd_bytealign (w[29], w[30], offset); + w[43] = amd_bytealign (w[28], w[29], offset); + w[42] = amd_bytealign (w[27], w[28], offset); + w[41] = amd_bytealign (w[26], w[27], offset); + w[40] = amd_bytealign (w[25], w[26], offset); + w[39] = amd_bytealign (w[24], w[25], offset); + w[38] = amd_bytealign (w[23], w[24], offset); + w[37] = amd_bytealign (w[22], w[23], offset); + w[36] = amd_bytealign (w[21], w[22], offset); + w[35] = amd_bytealign (w[20], w[21], offset); + w[34] = amd_bytealign (w[19], w[20], offset); + w[33] = amd_bytealign (w[18], w[19], offset); + w[32] = amd_bytealign (w[17], w[18], offset); + w[31] = amd_bytealign (w[16], w[17], offset); + w[30] = amd_bytealign (w[15], w[16], offset); + w[29] = amd_bytealign (w[14], w[15], offset); + w[28] = amd_bytealign (w[13], w[14], offset); + w[27] = amd_bytealign (w[12], w[13], offset); + w[26] = amd_bytealign (w[11], w[12], offset); + w[25] = amd_bytealign 
(w[10], w[11], offset); + w[24] = amd_bytealign (w[ 9], w[10], offset); + w[23] = amd_bytealign (w[ 8], w[ 9], offset); + w[22] = amd_bytealign (w[ 7], w[ 8], offset); + w[21] = amd_bytealign (w[ 6], w[ 7], offset); + w[20] = amd_bytealign (w[ 5], w[ 6], offset); + w[19] = amd_bytealign (w[ 4], w[ 5], offset); + w[18] = amd_bytealign (w[ 3], w[ 4], offset); + w[17] = amd_bytealign (w[ 2], w[ 3], offset); + w[16] = amd_bytealign (w[ 1], w[ 2], offset); + w[15] = amd_bytealign (w[ 0], w[ 1], offset); + w[14] = amd_bytealign ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign (w[47], w[48], offset); + w[62] = amd_bytealign (w[46], w[47], offset); + w[61] = amd_bytealign (w[45], w[46], offset); + w[60] = amd_bytealign (w[44], w[45], offset); + w[59] = amd_bytealign (w[43], w[44], offset); + w[58] = amd_bytealign (w[42], w[43], offset); + w[57] = amd_bytealign (w[41], w[42], offset); + w[56] = amd_bytealign (w[40], w[41], offset); + w[55] = amd_bytealign (w[39], w[40], offset); + w[54] = amd_bytealign (w[38], w[39], offset); + w[53] = amd_bytealign (w[37], w[38], offset); + w[52] = amd_bytealign (w[36], w[37], offset); + w[51] = amd_bytealign (w[35], w[36], offset); + w[50] = amd_bytealign (w[34], w[35], offset); + w[49] = amd_bytealign (w[33], w[34], offset); + w[48] = amd_bytealign (w[32], w[33], offset); + w[47] = amd_bytealign (w[31], w[32], offset); + w[46] = amd_bytealign (w[30], w[31], offset); + w[45] = amd_bytealign (w[29], w[30], offset); + w[44] = amd_bytealign (w[28], w[29], offset); + w[43] = amd_bytealign (w[27], w[28], offset); + w[42] = amd_bytealign (w[26], w[27], offset); + w[41] = amd_bytealign (w[25], w[26], offset); + w[40] = amd_bytealign (w[24], w[25], offset); + w[39] = amd_bytealign (w[23], w[24], offset); + w[38] = amd_bytealign (w[22], w[23], 
offset); + w[37] = amd_bytealign (w[21], w[22], offset); + w[36] = amd_bytealign (w[20], w[21], offset); + w[35] = amd_bytealign (w[19], w[20], offset); + w[34] = amd_bytealign (w[18], w[19], offset); + w[33] = amd_bytealign (w[17], w[18], offset); + w[32] = amd_bytealign (w[16], w[17], offset); + w[31] = amd_bytealign (w[15], w[16], offset); + w[30] = amd_bytealign (w[14], w[15], offset); + w[29] = amd_bytealign (w[13], w[14], offset); + w[28] = amd_bytealign (w[12], w[13], offset); + w[27] = amd_bytealign (w[11], w[12], offset); + w[26] = amd_bytealign (w[10], w[11], offset); + w[25] = amd_bytealign (w[ 9], w[10], offset); + w[24] = amd_bytealign (w[ 8], w[ 9], offset); + w[23] = amd_bytealign (w[ 7], w[ 8], offset); + w[22] = amd_bytealign (w[ 6], w[ 7], offset); + w[21] = amd_bytealign (w[ 5], w[ 6], offset); + w[20] = amd_bytealign (w[ 4], w[ 5], offset); + w[19] = amd_bytealign (w[ 3], w[ 4], offset); + w[18] = amd_bytealign (w[ 2], w[ 3], offset); + w[17] = amd_bytealign (w[ 1], w[ 2], offset); + w[16] = amd_bytealign (w[ 0], w[ 1], offset); + w[15] = amd_bytealign ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign (w[46], w[47], offset); + w[62] = amd_bytealign (w[45], w[46], offset); + w[61] = amd_bytealign (w[44], w[45], offset); + w[60] = amd_bytealign (w[43], w[44], offset); + w[59] = amd_bytealign (w[42], w[43], offset); + w[58] = amd_bytealign (w[41], w[42], offset); + w[57] = amd_bytealign (w[40], w[41], offset); + w[56] = amd_bytealign (w[39], w[40], offset); + w[55] = amd_bytealign (w[38], w[39], offset); + w[54] = amd_bytealign (w[37], w[38], offset); + w[53] = amd_bytealign (w[36], w[37], offset); + w[52] = amd_bytealign (w[35], w[36], offset); + w[51] = amd_bytealign (w[34], w[35], offset); + w[50] = amd_bytealign (w[33], w[34], 
offset); + w[49] = amd_bytealign (w[32], w[33], offset); + w[48] = amd_bytealign (w[31], w[32], offset); + w[47] = amd_bytealign (w[30], w[31], offset); + w[46] = amd_bytealign (w[29], w[30], offset); + w[45] = amd_bytealign (w[28], w[29], offset); + w[44] = amd_bytealign (w[27], w[28], offset); + w[43] = amd_bytealign (w[26], w[27], offset); + w[42] = amd_bytealign (w[25], w[26], offset); + w[41] = amd_bytealign (w[24], w[25], offset); + w[40] = amd_bytealign (w[23], w[24], offset); + w[39] = amd_bytealign (w[22], w[23], offset); + w[38] = amd_bytealign (w[21], w[22], offset); + w[37] = amd_bytealign (w[20], w[21], offset); + w[36] = amd_bytealign (w[19], w[20], offset); + w[35] = amd_bytealign (w[18], w[19], offset); + w[34] = amd_bytealign (w[17], w[18], offset); + w[33] = amd_bytealign (w[16], w[17], offset); + w[32] = amd_bytealign (w[15], w[16], offset); + w[31] = amd_bytealign (w[14], w[15], offset); + w[30] = amd_bytealign (w[13], w[14], offset); + w[29] = amd_bytealign (w[12], w[13], offset); + w[28] = amd_bytealign (w[11], w[12], offset); + w[27] = amd_bytealign (w[10], w[11], offset); + w[26] = amd_bytealign (w[ 9], w[10], offset); + w[25] = amd_bytealign (w[ 8], w[ 9], offset); + w[24] = amd_bytealign (w[ 7], w[ 8], offset); + w[23] = amd_bytealign (w[ 6], w[ 7], offset); + w[22] = amd_bytealign (w[ 5], w[ 6], offset); + w[21] = amd_bytealign (w[ 4], w[ 5], offset); + w[20] = amd_bytealign (w[ 3], w[ 4], offset); + w[19] = amd_bytealign (w[ 2], w[ 3], offset); + w[18] = amd_bytealign (w[ 1], w[ 2], offset); + w[17] = amd_bytealign (w[ 0], w[ 1], offset); + w[16] = amd_bytealign ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign (w[45], w[46], offset); + w[62] = amd_bytealign (w[44], w[45], offset); + w[61] = amd_bytealign (w[43], 
w[44], offset); + w[60] = amd_bytealign (w[42], w[43], offset); + w[59] = amd_bytealign (w[41], w[42], offset); + w[58] = amd_bytealign (w[40], w[41], offset); + w[57] = amd_bytealign (w[39], w[40], offset); + w[56] = amd_bytealign (w[38], w[39], offset); + w[55] = amd_bytealign (w[37], w[38], offset); + w[54] = amd_bytealign (w[36], w[37], offset); + w[53] = amd_bytealign (w[35], w[36], offset); + w[52] = amd_bytealign (w[34], w[35], offset); + w[51] = amd_bytealign (w[33], w[34], offset); + w[50] = amd_bytealign (w[32], w[33], offset); + w[49] = amd_bytealign (w[31], w[32], offset); + w[48] = amd_bytealign (w[30], w[31], offset); + w[47] = amd_bytealign (w[29], w[30], offset); + w[46] = amd_bytealign (w[28], w[29], offset); + w[45] = amd_bytealign (w[27], w[28], offset); + w[44] = amd_bytealign (w[26], w[27], offset); + w[43] = amd_bytealign (w[25], w[26], offset); + w[42] = amd_bytealign (w[24], w[25], offset); + w[41] = amd_bytealign (w[23], w[24], offset); + w[40] = amd_bytealign (w[22], w[23], offset); + w[39] = amd_bytealign (w[21], w[22], offset); + w[38] = amd_bytealign (w[20], w[21], offset); + w[37] = amd_bytealign (w[19], w[20], offset); + w[36] = amd_bytealign (w[18], w[19], offset); + w[35] = amd_bytealign (w[17], w[18], offset); + w[34] = amd_bytealign (w[16], w[17], offset); + w[33] = amd_bytealign (w[15], w[16], offset); + w[32] = amd_bytealign (w[14], w[15], offset); + w[31] = amd_bytealign (w[13], w[14], offset); + w[30] = amd_bytealign (w[12], w[13], offset); + w[29] = amd_bytealign (w[11], w[12], offset); + w[28] = amd_bytealign (w[10], w[11], offset); + w[27] = amd_bytealign (w[ 9], w[10], offset); + w[26] = amd_bytealign (w[ 8], w[ 9], offset); + w[25] = amd_bytealign (w[ 7], w[ 8], offset); + w[24] = amd_bytealign (w[ 6], w[ 7], offset); + w[23] = amd_bytealign (w[ 5], w[ 6], offset); + w[22] = amd_bytealign (w[ 4], w[ 5], offset); + w[21] = amd_bytealign (w[ 3], w[ 4], offset); + w[20] = amd_bytealign (w[ 2], w[ 3], offset); + w[19] = 
amd_bytealign (w[ 1], w[ 2], offset); + w[18] = amd_bytealign (w[ 0], w[ 1], offset); + w[17] = amd_bytealign ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign (w[44], w[45], offset); + w[62] = amd_bytealign (w[43], w[44], offset); + w[61] = amd_bytealign (w[42], w[43], offset); + w[60] = amd_bytealign (w[41], w[42], offset); + w[59] = amd_bytealign (w[40], w[41], offset); + w[58] = amd_bytealign (w[39], w[40], offset); + w[57] = amd_bytealign (w[38], w[39], offset); + w[56] = amd_bytealign (w[37], w[38], offset); + w[55] = amd_bytealign (w[36], w[37], offset); + w[54] = amd_bytealign (w[35], w[36], offset); + w[53] = amd_bytealign (w[34], w[35], offset); + w[52] = amd_bytealign (w[33], w[34], offset); + w[51] = amd_bytealign (w[32], w[33], offset); + w[50] = amd_bytealign (w[31], w[32], offset); + w[49] = amd_bytealign (w[30], w[31], offset); + w[48] = amd_bytealign (w[29], w[30], offset); + w[47] = amd_bytealign (w[28], w[29], offset); + w[46] = amd_bytealign (w[27], w[28], offset); + w[45] = amd_bytealign (w[26], w[27], offset); + w[44] = amd_bytealign (w[25], w[26], offset); + w[43] = amd_bytealign (w[24], w[25], offset); + w[42] = amd_bytealign (w[23], w[24], offset); + w[41] = amd_bytealign (w[22], w[23], offset); + w[40] = amd_bytealign (w[21], w[22], offset); + w[39] = amd_bytealign (w[20], w[21], offset); + w[38] = amd_bytealign (w[19], w[20], offset); + w[37] = amd_bytealign (w[18], w[19], offset); + w[36] = amd_bytealign (w[17], w[18], offset); + w[35] = amd_bytealign (w[16], w[17], offset); + w[34] = amd_bytealign (w[15], w[16], offset); + w[33] = amd_bytealign (w[14], w[15], offset); + w[32] = amd_bytealign (w[13], w[14], offset); + w[31] = amd_bytealign (w[12], w[13], offset); + w[30] = amd_bytealign (w[11], w[12], 
offset); + w[29] = amd_bytealign (w[10], w[11], offset); + w[28] = amd_bytealign (w[ 9], w[10], offset); + w[27] = amd_bytealign (w[ 8], w[ 9], offset); + w[26] = amd_bytealign (w[ 7], w[ 8], offset); + w[25] = amd_bytealign (w[ 6], w[ 7], offset); + w[24] = amd_bytealign (w[ 5], w[ 6], offset); + w[23] = amd_bytealign (w[ 4], w[ 5], offset); + w[22] = amd_bytealign (w[ 3], w[ 4], offset); + w[21] = amd_bytealign (w[ 2], w[ 3], offset); + w[20] = amd_bytealign (w[ 1], w[ 2], offset); + w[19] = amd_bytealign (w[ 0], w[ 1], offset); + w[18] = amd_bytealign ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign (w[43], w[44], offset); + w[62] = amd_bytealign (w[42], w[43], offset); + w[61] = amd_bytealign (w[41], w[42], offset); + w[60] = amd_bytealign (w[40], w[41], offset); + w[59] = amd_bytealign (w[39], w[40], offset); + w[58] = amd_bytealign (w[38], w[39], offset); + w[57] = amd_bytealign (w[37], w[38], offset); + w[56] = amd_bytealign (w[36], w[37], offset); + w[55] = amd_bytealign (w[35], w[36], offset); + w[54] = amd_bytealign (w[34], w[35], offset); + w[53] = amd_bytealign (w[33], w[34], offset); + w[52] = amd_bytealign (w[32], w[33], offset); + w[51] = amd_bytealign (w[31], w[32], offset); + w[50] = amd_bytealign (w[30], w[31], offset); + w[49] = amd_bytealign (w[29], w[30], offset); + w[48] = amd_bytealign (w[28], w[29], offset); + w[47] = amd_bytealign (w[27], w[28], offset); + w[46] = amd_bytealign (w[26], w[27], offset); + w[45] = amd_bytealign (w[25], w[26], offset); + w[44] = amd_bytealign (w[24], w[25], offset); + w[43] = amd_bytealign (w[23], w[24], offset); + w[42] = amd_bytealign (w[22], w[23], offset); + w[41] = amd_bytealign (w[21], w[22], offset); + w[40] = amd_bytealign (w[20], w[21], offset); + w[39] 
= amd_bytealign (w[19], w[20], offset); + w[38] = amd_bytealign (w[18], w[19], offset); + w[37] = amd_bytealign (w[17], w[18], offset); + w[36] = amd_bytealign (w[16], w[17], offset); + w[35] = amd_bytealign (w[15], w[16], offset); + w[34] = amd_bytealign (w[14], w[15], offset); + w[33] = amd_bytealign (w[13], w[14], offset); + w[32] = amd_bytealign (w[12], w[13], offset); + w[31] = amd_bytealign (w[11], w[12], offset); + w[30] = amd_bytealign (w[10], w[11], offset); + w[29] = amd_bytealign (w[ 9], w[10], offset); + w[28] = amd_bytealign (w[ 8], w[ 9], offset); + w[27] = amd_bytealign (w[ 7], w[ 8], offset); + w[26] = amd_bytealign (w[ 6], w[ 7], offset); + w[25] = amd_bytealign (w[ 5], w[ 6], offset); + w[24] = amd_bytealign (w[ 4], w[ 5], offset); + w[23] = amd_bytealign (w[ 3], w[ 4], offset); + w[22] = amd_bytealign (w[ 2], w[ 3], offset); + w[21] = amd_bytealign (w[ 1], w[ 2], offset); + w[20] = amd_bytealign (w[ 0], w[ 1], offset); + w[19] = amd_bytealign ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = amd_bytealign (w[42], w[43], offset); + w[62] = amd_bytealign (w[41], w[42], offset); + w[61] = amd_bytealign (w[40], w[41], offset); + w[60] = amd_bytealign (w[39], w[40], offset); + w[59] = amd_bytealign (w[38], w[39], offset); + w[58] = amd_bytealign (w[37], w[38], offset); + w[57] = amd_bytealign (w[36], w[37], offset); + w[56] = amd_bytealign (w[35], w[36], offset); + w[55] = amd_bytealign (w[34], w[35], offset); + w[54] = amd_bytealign (w[33], w[34], offset); + w[53] = amd_bytealign (w[32], w[33], offset); + w[52] = amd_bytealign (w[31], w[32], offset); + w[51] = amd_bytealign (w[30], w[31], offset); + w[50] = amd_bytealign (w[29], w[30], offset); + w[49] = amd_bytealign (w[28], w[29], offset); + w[48] = 
amd_bytealign (w[27], w[28], offset); + w[47] = amd_bytealign (w[26], w[27], offset); + w[46] = amd_bytealign (w[25], w[26], offset); + w[45] = amd_bytealign (w[24], w[25], offset); + w[44] = amd_bytealign (w[23], w[24], offset); + w[43] = amd_bytealign (w[22], w[23], offset); + w[42] = amd_bytealign (w[21], w[22], offset); + w[41] = amd_bytealign (w[20], w[21], offset); + w[40] = amd_bytealign (w[19], w[20], offset); + w[39] = amd_bytealign (w[18], w[19], offset); + w[38] = amd_bytealign (w[17], w[18], offset); + w[37] = amd_bytealign (w[16], w[17], offset); + w[36] = amd_bytealign (w[15], w[16], offset); + w[35] = amd_bytealign (w[14], w[15], offset); + w[34] = amd_bytealign (w[13], w[14], offset); + w[33] = amd_bytealign (w[12], w[13], offset); + w[32] = amd_bytealign (w[11], w[12], offset); + w[31] = amd_bytealign (w[10], w[11], offset); + w[30] = amd_bytealign (w[ 9], w[10], offset); + w[29] = amd_bytealign (w[ 8], w[ 9], offset); + w[28] = amd_bytealign (w[ 7], w[ 8], offset); + w[27] = amd_bytealign (w[ 6], w[ 7], offset); + w[26] = amd_bytealign (w[ 5], w[ 6], offset); + w[25] = amd_bytealign (w[ 4], w[ 5], offset); + w[24] = amd_bytealign (w[ 3], w[ 4], offset); + w[23] = amd_bytealign (w[ 2], w[ 3], offset); + w[22] = amd_bytealign (w[ 1], w[ 2], offset); + w[21] = amd_bytealign (w[ 0], w[ 1], offset); + w[20] = amd_bytealign ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign (w[41], w[42], offset); + w[62] = amd_bytealign (w[40], w[41], offset); + w[61] = amd_bytealign (w[39], w[40], offset); + w[60] = amd_bytealign (w[38], w[39], offset); + w[59] = amd_bytealign (w[37], w[38], offset); + w[58] = amd_bytealign (w[36], w[37], offset); + w[57] = amd_bytealign (w[35], w[36], offset); 
+ w[56] = amd_bytealign (w[34], w[35], offset); + w[55] = amd_bytealign (w[33], w[34], offset); + w[54] = amd_bytealign (w[32], w[33], offset); + w[53] = amd_bytealign (w[31], w[32], offset); + w[52] = amd_bytealign (w[30], w[31], offset); + w[51] = amd_bytealign (w[29], w[30], offset); + w[50] = amd_bytealign (w[28], w[29], offset); + w[49] = amd_bytealign (w[27], w[28], offset); + w[48] = amd_bytealign (w[26], w[27], offset); + w[47] = amd_bytealign (w[25], w[26], offset); + w[46] = amd_bytealign (w[24], w[25], offset); + w[45] = amd_bytealign (w[23], w[24], offset); + w[44] = amd_bytealign (w[22], w[23], offset); + w[43] = amd_bytealign (w[21], w[22], offset); + w[42] = amd_bytealign (w[20], w[21], offset); + w[41] = amd_bytealign (w[19], w[20], offset); + w[40] = amd_bytealign (w[18], w[19], offset); + w[39] = amd_bytealign (w[17], w[18], offset); + w[38] = amd_bytealign (w[16], w[17], offset); + w[37] = amd_bytealign (w[15], w[16], offset); + w[36] = amd_bytealign (w[14], w[15], offset); + w[35] = amd_bytealign (w[13], w[14], offset); + w[34] = amd_bytealign (w[12], w[13], offset); + w[33] = amd_bytealign (w[11], w[12], offset); + w[32] = amd_bytealign (w[10], w[11], offset); + w[31] = amd_bytealign (w[ 9], w[10], offset); + w[30] = amd_bytealign (w[ 8], w[ 9], offset); + w[29] = amd_bytealign (w[ 7], w[ 8], offset); + w[28] = amd_bytealign (w[ 6], w[ 7], offset); + w[27] = amd_bytealign (w[ 5], w[ 6], offset); + w[26] = amd_bytealign (w[ 4], w[ 5], offset); + w[25] = amd_bytealign (w[ 3], w[ 4], offset); + w[24] = amd_bytealign (w[ 2], w[ 3], offset); + w[23] = amd_bytealign (w[ 1], w[ 2], offset); + w[22] = amd_bytealign (w[ 0], w[ 1], offset); + w[21] = amd_bytealign ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + 
+ break; + + case 22: + w[63] = amd_bytealign (w[40], w[41], offset); + w[62] = amd_bytealign (w[39], w[40], offset); + w[61] = amd_bytealign (w[38], w[39], offset); + w[60] = amd_bytealign (w[37], w[38], offset); + w[59] = amd_bytealign (w[36], w[37], offset); + w[58] = amd_bytealign (w[35], w[36], offset); + w[57] = amd_bytealign (w[34], w[35], offset); + w[56] = amd_bytealign (w[33], w[34], offset); + w[55] = amd_bytealign (w[32], w[33], offset); + w[54] = amd_bytealign (w[31], w[32], offset); + w[53] = amd_bytealign (w[30], w[31], offset); + w[52] = amd_bytealign (w[29], w[30], offset); + w[51] = amd_bytealign (w[28], w[29], offset); + w[50] = amd_bytealign (w[27], w[28], offset); + w[49] = amd_bytealign (w[26], w[27], offset); + w[48] = amd_bytealign (w[25], w[26], offset); + w[47] = amd_bytealign (w[24], w[25], offset); + w[46] = amd_bytealign (w[23], w[24], offset); + w[45] = amd_bytealign (w[22], w[23], offset); + w[44] = amd_bytealign (w[21], w[22], offset); + w[43] = amd_bytealign (w[20], w[21], offset); + w[42] = amd_bytealign (w[19], w[20], offset); + w[41] = amd_bytealign (w[18], w[19], offset); + w[40] = amd_bytealign (w[17], w[18], offset); + w[39] = amd_bytealign (w[16], w[17], offset); + w[38] = amd_bytealign (w[15], w[16], offset); + w[37] = amd_bytealign (w[14], w[15], offset); + w[36] = amd_bytealign (w[13], w[14], offset); + w[35] = amd_bytealign (w[12], w[13], offset); + w[34] = amd_bytealign (w[11], w[12], offset); + w[33] = amd_bytealign (w[10], w[11], offset); + w[32] = amd_bytealign (w[ 9], w[10], offset); + w[31] = amd_bytealign (w[ 8], w[ 9], offset); + w[30] = amd_bytealign (w[ 7], w[ 8], offset); + w[29] = amd_bytealign (w[ 6], w[ 7], offset); + w[28] = amd_bytealign (w[ 5], w[ 6], offset); + w[27] = amd_bytealign (w[ 4], w[ 5], offset); + w[26] = amd_bytealign (w[ 3], w[ 4], offset); + w[25] = amd_bytealign (w[ 2], w[ 3], offset); + w[24] = amd_bytealign (w[ 1], w[ 2], offset); + w[23] = amd_bytealign (w[ 0], w[ 1], offset); + w[22] = 
amd_bytealign ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign (w[39], w[40], offset); + w[62] = amd_bytealign (w[38], w[39], offset); + w[61] = amd_bytealign (w[37], w[38], offset); + w[60] = amd_bytealign (w[36], w[37], offset); + w[59] = amd_bytealign (w[35], w[36], offset); + w[58] = amd_bytealign (w[34], w[35], offset); + w[57] = amd_bytealign (w[33], w[34], offset); + w[56] = amd_bytealign (w[32], w[33], offset); + w[55] = amd_bytealign (w[31], w[32], offset); + w[54] = amd_bytealign (w[30], w[31], offset); + w[53] = amd_bytealign (w[29], w[30], offset); + w[52] = amd_bytealign (w[28], w[29], offset); + w[51] = amd_bytealign (w[27], w[28], offset); + w[50] = amd_bytealign (w[26], w[27], offset); + w[49] = amd_bytealign (w[25], w[26], offset); + w[48] = amd_bytealign (w[24], w[25], offset); + w[47] = amd_bytealign (w[23], w[24], offset); + w[46] = amd_bytealign (w[22], w[23], offset); + w[45] = amd_bytealign (w[21], w[22], offset); + w[44] = amd_bytealign (w[20], w[21], offset); + w[43] = amd_bytealign (w[19], w[20], offset); + w[42] = amd_bytealign (w[18], w[19], offset); + w[41] = amd_bytealign (w[17], w[18], offset); + w[40] = amd_bytealign (w[16], w[17], offset); + w[39] = amd_bytealign (w[15], w[16], offset); + w[38] = amd_bytealign (w[14], w[15], offset); + w[37] = amd_bytealign (w[13], w[14], offset); + w[36] = amd_bytealign (w[12], w[13], offset); + w[35] = amd_bytealign (w[11], w[12], offset); + w[34] = amd_bytealign (w[10], w[11], offset); + w[33] = amd_bytealign (w[ 9], w[10], offset); + w[32] = amd_bytealign (w[ 8], w[ 9], offset); + w[31] = amd_bytealign (w[ 7], w[ 8], offset); + w[30] = amd_bytealign (w[ 6], w[ 7], offset); + w[29] = 
amd_bytealign (w[ 5], w[ 6], offset); + w[28] = amd_bytealign (w[ 4], w[ 5], offset); + w[27] = amd_bytealign (w[ 3], w[ 4], offset); + w[26] = amd_bytealign (w[ 2], w[ 3], offset); + w[25] = amd_bytealign (w[ 1], w[ 2], offset); + w[24] = amd_bytealign (w[ 0], w[ 1], offset); + w[23] = amd_bytealign ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign (w[38], w[39], offset); + w[62] = amd_bytealign (w[37], w[38], offset); + w[61] = amd_bytealign (w[36], w[37], offset); + w[60] = amd_bytealign (w[35], w[36], offset); + w[59] = amd_bytealign (w[34], w[35], offset); + w[58] = amd_bytealign (w[33], w[34], offset); + w[57] = amd_bytealign (w[32], w[33], offset); + w[56] = amd_bytealign (w[31], w[32], offset); + w[55] = amd_bytealign (w[30], w[31], offset); + w[54] = amd_bytealign (w[29], w[30], offset); + w[53] = amd_bytealign (w[28], w[29], offset); + w[52] = amd_bytealign (w[27], w[28], offset); + w[51] = amd_bytealign (w[26], w[27], offset); + w[50] = amd_bytealign (w[25], w[26], offset); + w[49] = amd_bytealign (w[24], w[25], offset); + w[48] = amd_bytealign (w[23], w[24], offset); + w[47] = amd_bytealign (w[22], w[23], offset); + w[46] = amd_bytealign (w[21], w[22], offset); + w[45] = amd_bytealign (w[20], w[21], offset); + w[44] = amd_bytealign (w[19], w[20], offset); + w[43] = amd_bytealign (w[18], w[19], offset); + w[42] = amd_bytealign (w[17], w[18], offset); + w[41] = amd_bytealign (w[16], w[17], offset); + w[40] = amd_bytealign (w[15], w[16], offset); + w[39] = amd_bytealign (w[14], w[15], offset); + w[38] = amd_bytealign (w[13], w[14], offset); + w[37] = amd_bytealign (w[12], w[13], offset); + w[36] = amd_bytealign (w[11], w[12], offset); + w[35] = 
amd_bytealign (w[10], w[11], offset); + w[34] = amd_bytealign (w[ 9], w[10], offset); + w[33] = amd_bytealign (w[ 8], w[ 9], offset); + w[32] = amd_bytealign (w[ 7], w[ 8], offset); + w[31] = amd_bytealign (w[ 6], w[ 7], offset); + w[30] = amd_bytealign (w[ 5], w[ 6], offset); + w[29] = amd_bytealign (w[ 4], w[ 5], offset); + w[28] = amd_bytealign (w[ 3], w[ 4], offset); + w[27] = amd_bytealign (w[ 2], w[ 3], offset); + w[26] = amd_bytealign (w[ 1], w[ 2], offset); + w[25] = amd_bytealign (w[ 0], w[ 1], offset); + w[24] = amd_bytealign ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign (w[37], w[38], offset); + w[62] = amd_bytealign (w[36], w[37], offset); + w[61] = amd_bytealign (w[35], w[36], offset); + w[60] = amd_bytealign (w[34], w[35], offset); + w[59] = amd_bytealign (w[33], w[34], offset); + w[58] = amd_bytealign (w[32], w[33], offset); + w[57] = amd_bytealign (w[31], w[32], offset); + w[56] = amd_bytealign (w[30], w[31], offset); + w[55] = amd_bytealign (w[29], w[30], offset); + w[54] = amd_bytealign (w[28], w[29], offset); + w[53] = amd_bytealign (w[27], w[28], offset); + w[52] = amd_bytealign (w[26], w[27], offset); + w[51] = amd_bytealign (w[25], w[26], offset); + w[50] = amd_bytealign (w[24], w[25], offset); + w[49] = amd_bytealign (w[23], w[24], offset); + w[48] = amd_bytealign (w[22], w[23], offset); + w[47] = amd_bytealign (w[21], w[22], offset); + w[46] = amd_bytealign (w[20], w[21], offset); + w[45] = amd_bytealign (w[19], w[20], offset); + w[44] = amd_bytealign (w[18], w[19], offset); + w[43] = amd_bytealign (w[17], w[18], offset); + w[42] = amd_bytealign (w[16], w[17], offset); + w[41] = amd_bytealign (w[15], w[16], 
offset); + w[40] = amd_bytealign (w[14], w[15], offset); + w[39] = amd_bytealign (w[13], w[14], offset); + w[38] = amd_bytealign (w[12], w[13], offset); + w[37] = amd_bytealign (w[11], w[12], offset); + w[36] = amd_bytealign (w[10], w[11], offset); + w[35] = amd_bytealign (w[ 9], w[10], offset); + w[34] = amd_bytealign (w[ 8], w[ 9], offset); + w[33] = amd_bytealign (w[ 7], w[ 8], offset); + w[32] = amd_bytealign (w[ 6], w[ 7], offset); + w[31] = amd_bytealign (w[ 5], w[ 6], offset); + w[30] = amd_bytealign (w[ 4], w[ 5], offset); + w[29] = amd_bytealign (w[ 3], w[ 4], offset); + w[28] = amd_bytealign (w[ 2], w[ 3], offset); + w[27] = amd_bytealign (w[ 1], w[ 2], offset); + w[26] = amd_bytealign (w[ 0], w[ 1], offset); + w[25] = amd_bytealign ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign (w[36], w[37], offset); + w[62] = amd_bytealign (w[35], w[36], offset); + w[61] = amd_bytealign (w[34], w[35], offset); + w[60] = amd_bytealign (w[33], w[34], offset); + w[59] = amd_bytealign (w[32], w[33], offset); + w[58] = amd_bytealign (w[31], w[32], offset); + w[57] = amd_bytealign (w[30], w[31], offset); + w[56] = amd_bytealign (w[29], w[30], offset); + w[55] = amd_bytealign (w[28], w[29], offset); + w[54] = amd_bytealign (w[27], w[28], offset); + w[53] = amd_bytealign (w[26], w[27], offset); + w[52] = amd_bytealign (w[25], w[26], offset); + w[51] = amd_bytealign (w[24], w[25], offset); + w[50] = amd_bytealign (w[23], w[24], offset); + w[49] = amd_bytealign (w[22], w[23], offset); + w[48] = amd_bytealign (w[21], w[22], offset); + w[47] = amd_bytealign (w[20], w[21], offset); + w[46] = amd_bytealign (w[19], w[20], offset); + w[45] = 
amd_bytealign (w[18], w[19], offset); + w[44] = amd_bytealign (w[17], w[18], offset); + w[43] = amd_bytealign (w[16], w[17], offset); + w[42] = amd_bytealign (w[15], w[16], offset); + w[41] = amd_bytealign (w[14], w[15], offset); + w[40] = amd_bytealign (w[13], w[14], offset); + w[39] = amd_bytealign (w[12], w[13], offset); + w[38] = amd_bytealign (w[11], w[12], offset); + w[37] = amd_bytealign (w[10], w[11], offset); + w[36] = amd_bytealign (w[ 9], w[10], offset); + w[35] = amd_bytealign (w[ 8], w[ 9], offset); + w[34] = amd_bytealign (w[ 7], w[ 8], offset); + w[33] = amd_bytealign (w[ 6], w[ 7], offset); + w[32] = amd_bytealign (w[ 5], w[ 6], offset); + w[31] = amd_bytealign (w[ 4], w[ 5], offset); + w[30] = amd_bytealign (w[ 3], w[ 4], offset); + w[29] = amd_bytealign (w[ 2], w[ 3], offset); + w[28] = amd_bytealign (w[ 1], w[ 2], offset); + w[27] = amd_bytealign (w[ 0], w[ 1], offset); + w[26] = amd_bytealign ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign (w[35], w[36], offset); + w[62] = amd_bytealign (w[34], w[35], offset); + w[61] = amd_bytealign (w[33], w[34], offset); + w[60] = amd_bytealign (w[32], w[33], offset); + w[59] = amd_bytealign (w[31], w[32], offset); + w[58] = amd_bytealign (w[30], w[31], offset); + w[57] = amd_bytealign (w[29], w[30], offset); + w[56] = amd_bytealign (w[28], w[29], offset); + w[55] = amd_bytealign (w[27], w[28], offset); + w[54] = amd_bytealign (w[26], w[27], offset); + w[53] = amd_bytealign (w[25], w[26], offset); + w[52] = amd_bytealign (w[24], w[25], offset); + w[51] = amd_bytealign (w[23], w[24], offset); + w[50] = amd_bytealign (w[22], w[23], offset); + w[49] = 
amd_bytealign (w[21], w[22], offset); + w[48] = amd_bytealign (w[20], w[21], offset); + w[47] = amd_bytealign (w[19], w[20], offset); + w[46] = amd_bytealign (w[18], w[19], offset); + w[45] = amd_bytealign (w[17], w[18], offset); + w[44] = amd_bytealign (w[16], w[17], offset); + w[43] = amd_bytealign (w[15], w[16], offset); + w[42] = amd_bytealign (w[14], w[15], offset); + w[41] = amd_bytealign (w[13], w[14], offset); + w[40] = amd_bytealign (w[12], w[13], offset); + w[39] = amd_bytealign (w[11], w[12], offset); + w[38] = amd_bytealign (w[10], w[11], offset); + w[37] = amd_bytealign (w[ 9], w[10], offset); + w[36] = amd_bytealign (w[ 8], w[ 9], offset); + w[35] = amd_bytealign (w[ 7], w[ 8], offset); + w[34] = amd_bytealign (w[ 6], w[ 7], offset); + w[33] = amd_bytealign (w[ 5], w[ 6], offset); + w[32] = amd_bytealign (w[ 4], w[ 5], offset); + w[31] = amd_bytealign (w[ 3], w[ 4], offset); + w[30] = amd_bytealign (w[ 2], w[ 3], offset); + w[29] = amd_bytealign (w[ 1], w[ 2], offset); + w[28] = amd_bytealign (w[ 0], w[ 1], offset); + w[27] = amd_bytealign ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign (w[34], w[35], offset); + w[62] = amd_bytealign (w[33], w[34], offset); + w[61] = amd_bytealign (w[32], w[33], offset); + w[60] = amd_bytealign (w[31], w[32], offset); + w[59] = amd_bytealign (w[30], w[31], offset); + w[58] = amd_bytealign (w[29], w[30], offset); + w[57] = amd_bytealign (w[28], w[29], offset); + w[56] = amd_bytealign (w[27], w[28], offset); + w[55] = amd_bytealign (w[26], w[27], offset); + w[54] = amd_bytealign (w[25], w[26], offset); + w[53] = amd_bytealign (w[24], w[25], offset); + 
w[52] = amd_bytealign (w[23], w[24], offset); + w[51] = amd_bytealign (w[22], w[23], offset); + w[50] = amd_bytealign (w[21], w[22], offset); + w[49] = amd_bytealign (w[20], w[21], offset); + w[48] = amd_bytealign (w[19], w[20], offset); + w[47] = amd_bytealign (w[18], w[19], offset); + w[46] = amd_bytealign (w[17], w[18], offset); + w[45] = amd_bytealign (w[16], w[17], offset); + w[44] = amd_bytealign (w[15], w[16], offset); + w[43] = amd_bytealign (w[14], w[15], offset); + w[42] = amd_bytealign (w[13], w[14], offset); + w[41] = amd_bytealign (w[12], w[13], offset); + w[40] = amd_bytealign (w[11], w[12], offset); + w[39] = amd_bytealign (w[10], w[11], offset); + w[38] = amd_bytealign (w[ 9], w[10], offset); + w[37] = amd_bytealign (w[ 8], w[ 9], offset); + w[36] = amd_bytealign (w[ 7], w[ 8], offset); + w[35] = amd_bytealign (w[ 6], w[ 7], offset); + w[34] = amd_bytealign (w[ 5], w[ 6], offset); + w[33] = amd_bytealign (w[ 4], w[ 5], offset); + w[32] = amd_bytealign (w[ 3], w[ 4], offset); + w[31] = amd_bytealign (w[ 2], w[ 3], offset); + w[30] = amd_bytealign (w[ 1], w[ 2], offset); + w[29] = amd_bytealign (w[ 0], w[ 1], offset); + w[28] = amd_bytealign ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = amd_bytealign (w[33], w[34], offset); + w[62] = amd_bytealign (w[32], w[33], offset); + w[61] = amd_bytealign (w[31], w[32], offset); + w[60] = amd_bytealign (w[30], w[31], offset); + w[59] = amd_bytealign (w[29], w[30], offset); + w[58] = amd_bytealign (w[28], w[29], offset); + w[57] = amd_bytealign (w[27], w[28], offset); + w[56] = amd_bytealign (w[26], w[27], offset); + w[55] = amd_bytealign (w[25], 
w[26], offset); + w[54] = amd_bytealign (w[24], w[25], offset); + w[53] = amd_bytealign (w[23], w[24], offset); + w[52] = amd_bytealign (w[22], w[23], offset); + w[51] = amd_bytealign (w[21], w[22], offset); + w[50] = amd_bytealign (w[20], w[21], offset); + w[49] = amd_bytealign (w[19], w[20], offset); + w[48] = amd_bytealign (w[18], w[19], offset); + w[47] = amd_bytealign (w[17], w[18], offset); + w[46] = amd_bytealign (w[16], w[17], offset); + w[45] = amd_bytealign (w[15], w[16], offset); + w[44] = amd_bytealign (w[14], w[15], offset); + w[43] = amd_bytealign (w[13], w[14], offset); + w[42] = amd_bytealign (w[12], w[13], offset); + w[41] = amd_bytealign (w[11], w[12], offset); + w[40] = amd_bytealign (w[10], w[11], offset); + w[39] = amd_bytealign (w[ 9], w[10], offset); + w[38] = amd_bytealign (w[ 8], w[ 9], offset); + w[37] = amd_bytealign (w[ 7], w[ 8], offset); + w[36] = amd_bytealign (w[ 6], w[ 7], offset); + w[35] = amd_bytealign (w[ 5], w[ 6], offset); + w[34] = amd_bytealign (w[ 4], w[ 5], offset); + w[33] = amd_bytealign (w[ 3], w[ 4], offset); + w[32] = amd_bytealign (w[ 2], w[ 3], offset); + w[31] = amd_bytealign (w[ 1], w[ 2], offset); + w[30] = amd_bytealign (w[ 0], w[ 1], offset); + w[29] = amd_bytealign ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign (w[32], w[33], offset); + w[62] = amd_bytealign (w[31], w[32], offset); + w[61] = amd_bytealign (w[30], w[31], offset); + w[60] = amd_bytealign (w[29], w[30], offset); + w[59] = amd_bytealign (w[28], w[29], offset); + w[58] = amd_bytealign (w[27], w[28], offset); + w[57] = amd_bytealign (w[26], w[27], offset); + 
w[56] = amd_bytealign (w[25], w[26], offset); + w[55] = amd_bytealign (w[24], w[25], offset); + w[54] = amd_bytealign (w[23], w[24], offset); + w[53] = amd_bytealign (w[22], w[23], offset); + w[52] = amd_bytealign (w[21], w[22], offset); + w[51] = amd_bytealign (w[20], w[21], offset); + w[50] = amd_bytealign (w[19], w[20], offset); + w[49] = amd_bytealign (w[18], w[19], offset); + w[48] = amd_bytealign (w[17], w[18], offset); + w[47] = amd_bytealign (w[16], w[17], offset); + w[46] = amd_bytealign (w[15], w[16], offset); + w[45] = amd_bytealign (w[14], w[15], offset); + w[44] = amd_bytealign (w[13], w[14], offset); + w[43] = amd_bytealign (w[12], w[13], offset); + w[42] = amd_bytealign (w[11], w[12], offset); + w[41] = amd_bytealign (w[10], w[11], offset); + w[40] = amd_bytealign (w[ 9], w[10], offset); + w[39] = amd_bytealign (w[ 8], w[ 9], offset); + w[38] = amd_bytealign (w[ 7], w[ 8], offset); + w[37] = amd_bytealign (w[ 6], w[ 7], offset); + w[36] = amd_bytealign (w[ 5], w[ 6], offset); + w[35] = amd_bytealign (w[ 4], w[ 5], offset); + w[34] = amd_bytealign (w[ 3], w[ 4], offset); + w[33] = amd_bytealign (w[ 2], w[ 3], offset); + w[32] = amd_bytealign (w[ 1], w[ 2], offset); + w[31] = amd_bytealign (w[ 0], w[ 1], offset); + w[30] = amd_bytealign ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign (w[31], w[32], offset); + w[62] = amd_bytealign (w[30], w[31], offset); + w[61] = amd_bytealign (w[29], w[30], offset); + w[60] = amd_bytealign (w[28], w[29], offset); + w[59] = amd_bytealign (w[27], w[28], offset); + w[58] = amd_bytealign (w[26], w[27], offset); + 
w[57] = amd_bytealign (w[25], w[26], offset); + w[56] = amd_bytealign (w[24], w[25], offset); + w[55] = amd_bytealign (w[23], w[24], offset); + w[54] = amd_bytealign (w[22], w[23], offset); + w[53] = amd_bytealign (w[21], w[22], offset); + w[52] = amd_bytealign (w[20], w[21], offset); + w[51] = amd_bytealign (w[19], w[20], offset); + w[50] = amd_bytealign (w[18], w[19], offset); + w[49] = amd_bytealign (w[17], w[18], offset); + w[48] = amd_bytealign (w[16], w[17], offset); + w[47] = amd_bytealign (w[15], w[16], offset); + w[46] = amd_bytealign (w[14], w[15], offset); + w[45] = amd_bytealign (w[13], w[14], offset); + w[44] = amd_bytealign (w[12], w[13], offset); + w[43] = amd_bytealign (w[11], w[12], offset); + w[42] = amd_bytealign (w[10], w[11], offset); + w[41] = amd_bytealign (w[ 9], w[10], offset); + w[40] = amd_bytealign (w[ 8], w[ 9], offset); + w[39] = amd_bytealign (w[ 7], w[ 8], offset); + w[38] = amd_bytealign (w[ 6], w[ 7], offset); + w[37] = amd_bytealign (w[ 5], w[ 6], offset); + w[36] = amd_bytealign (w[ 4], w[ 5], offset); + w[35] = amd_bytealign (w[ 3], w[ 4], offset); + w[34] = amd_bytealign (w[ 2], w[ 3], offset); + w[33] = amd_bytealign (w[ 1], w[ 2], offset); + w[32] = amd_bytealign (w[ 0], w[ 1], offset); + w[31] = amd_bytealign ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign (w[30], w[31], offset); + w[62] = amd_bytealign (w[29], w[30], offset); + w[61] = amd_bytealign (w[28], w[29], offset); + w[60] = amd_bytealign (w[27], w[28], offset); + w[59] = amd_bytealign (w[26], w[27], offset); + w[58] = amd_bytealign (w[25], w[26], 
offset); + w[57] = amd_bytealign (w[24], w[25], offset); + w[56] = amd_bytealign (w[23], w[24], offset); + w[55] = amd_bytealign (w[22], w[23], offset); + w[54] = amd_bytealign (w[21], w[22], offset); + w[53] = amd_bytealign (w[20], w[21], offset); + w[52] = amd_bytealign (w[19], w[20], offset); + w[51] = amd_bytealign (w[18], w[19], offset); + w[50] = amd_bytealign (w[17], w[18], offset); + w[49] = amd_bytealign (w[16], w[17], offset); + w[48] = amd_bytealign (w[15], w[16], offset); + w[47] = amd_bytealign (w[14], w[15], offset); + w[46] = amd_bytealign (w[13], w[14], offset); + w[45] = amd_bytealign (w[12], w[13], offset); + w[44] = amd_bytealign (w[11], w[12], offset); + w[43] = amd_bytealign (w[10], w[11], offset); + w[42] = amd_bytealign (w[ 9], w[10], offset); + w[41] = amd_bytealign (w[ 8], w[ 9], offset); + w[40] = amd_bytealign (w[ 7], w[ 8], offset); + w[39] = amd_bytealign (w[ 6], w[ 7], offset); + w[38] = amd_bytealign (w[ 5], w[ 6], offset); + w[37] = amd_bytealign (w[ 4], w[ 5], offset); + w[36] = amd_bytealign (w[ 3], w[ 4], offset); + w[35] = amd_bytealign (w[ 2], w[ 3], offset); + w[34] = amd_bytealign (w[ 1], w[ 2], offset); + w[33] = amd_bytealign (w[ 0], w[ 1], offset); + w[32] = amd_bytealign ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = amd_bytealign (w[29], w[30], offset); + w[62] = amd_bytealign (w[28], w[29], offset); + w[61] = amd_bytealign (w[27], w[28], offset); + w[60] = amd_bytealign (w[26], w[27], offset); + w[59] = amd_bytealign (w[25], w[26], offset); + w[58] = amd_bytealign (w[24], w[25], offset); + w[57] = 
amd_bytealign (w[23], w[24], offset); + w[56] = amd_bytealign (w[22], w[23], offset); + w[55] = amd_bytealign (w[21], w[22], offset); + w[54] = amd_bytealign (w[20], w[21], offset); + w[53] = amd_bytealign (w[19], w[20], offset); + w[52] = amd_bytealign (w[18], w[19], offset); + w[51] = amd_bytealign (w[17], w[18], offset); + w[50] = amd_bytealign (w[16], w[17], offset); + w[49] = amd_bytealign (w[15], w[16], offset); + w[48] = amd_bytealign (w[14], w[15], offset); + w[47] = amd_bytealign (w[13], w[14], offset); + w[46] = amd_bytealign (w[12], w[13], offset); + w[45] = amd_bytealign (w[11], w[12], offset); + w[44] = amd_bytealign (w[10], w[11], offset); + w[43] = amd_bytealign (w[ 9], w[10], offset); + w[42] = amd_bytealign (w[ 8], w[ 9], offset); + w[41] = amd_bytealign (w[ 7], w[ 8], offset); + w[40] = amd_bytealign (w[ 6], w[ 7], offset); + w[39] = amd_bytealign (w[ 5], w[ 6], offset); + w[38] = amd_bytealign (w[ 4], w[ 5], offset); + w[37] = amd_bytealign (w[ 3], w[ 4], offset); + w[36] = amd_bytealign (w[ 2], w[ 3], offset); + w[35] = amd_bytealign (w[ 1], w[ 2], offset); + w[34] = amd_bytealign (w[ 0], w[ 1], offset); + w[33] = amd_bytealign ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = amd_bytealign (w[28], w[29], offset); + w[62] = amd_bytealign (w[27], w[28], offset); + w[61] = amd_bytealign (w[26], w[27], offset); + w[60] = amd_bytealign (w[25], w[26], offset); + w[59] = amd_bytealign (w[24], w[25], offset); + w[58] = amd_bytealign (w[23], w[24], offset); + w[57] = amd_bytealign (w[22], w[23], offset); + w[56] = amd_bytealign 
(w[21], w[22], offset); + w[55] = amd_bytealign (w[20], w[21], offset); + w[54] = amd_bytealign (w[19], w[20], offset); + w[53] = amd_bytealign (w[18], w[19], offset); + w[52] = amd_bytealign (w[17], w[18], offset); + w[51] = amd_bytealign (w[16], w[17], offset); + w[50] = amd_bytealign (w[15], w[16], offset); + w[49] = amd_bytealign (w[14], w[15], offset); + w[48] = amd_bytealign (w[13], w[14], offset); + w[47] = amd_bytealign (w[12], w[13], offset); + w[46] = amd_bytealign (w[11], w[12], offset); + w[45] = amd_bytealign (w[10], w[11], offset); + w[44] = amd_bytealign (w[ 9], w[10], offset); + w[43] = amd_bytealign (w[ 8], w[ 9], offset); + w[42] = amd_bytealign (w[ 7], w[ 8], offset); + w[41] = amd_bytealign (w[ 6], w[ 7], offset); + w[40] = amd_bytealign (w[ 5], w[ 6], offset); + w[39] = amd_bytealign (w[ 4], w[ 5], offset); + w[38] = amd_bytealign (w[ 3], w[ 4], offset); + w[37] = amd_bytealign (w[ 2], w[ 3], offset); + w[36] = amd_bytealign (w[ 1], w[ 2], offset); + w[35] = amd_bytealign (w[ 0], w[ 1], offset); + w[34] = amd_bytealign ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign (w[27], w[28], offset); + w[62] = amd_bytealign (w[26], w[27], offset); + w[61] = amd_bytealign (w[25], w[26], offset); + w[60] = amd_bytealign (w[24], w[25], offset); + w[59] = amd_bytealign (w[23], w[24], offset); + w[58] = amd_bytealign (w[22], w[23], offset); + w[57] = amd_bytealign (w[21], w[22], offset); + w[56] = amd_bytealign (w[20], w[21], offset); + w[55] = amd_bytealign (w[19], w[20], offset); + w[54] = amd_bytealign 
(w[18], w[19], offset); + w[53] = amd_bytealign (w[17], w[18], offset); + w[52] = amd_bytealign (w[16], w[17], offset); + w[51] = amd_bytealign (w[15], w[16], offset); + w[50] = amd_bytealign (w[14], w[15], offset); + w[49] = amd_bytealign (w[13], w[14], offset); + w[48] = amd_bytealign (w[12], w[13], offset); + w[47] = amd_bytealign (w[11], w[12], offset); + w[46] = amd_bytealign (w[10], w[11], offset); + w[45] = amd_bytealign (w[ 9], w[10], offset); + w[44] = amd_bytealign (w[ 8], w[ 9], offset); + w[43] = amd_bytealign (w[ 7], w[ 8], offset); + w[42] = amd_bytealign (w[ 6], w[ 7], offset); + w[41] = amd_bytealign (w[ 5], w[ 6], offset); + w[40] = amd_bytealign (w[ 4], w[ 5], offset); + w[39] = amd_bytealign (w[ 3], w[ 4], offset); + w[38] = amd_bytealign (w[ 2], w[ 3], offset); + w[37] = amd_bytealign (w[ 1], w[ 2], offset); + w[36] = amd_bytealign (w[ 0], w[ 1], offset); + w[35] = amd_bytealign ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign (w[26], w[27], offset); + w[62] = amd_bytealign (w[25], w[26], offset); + w[61] = amd_bytealign (w[24], w[25], offset); + w[60] = amd_bytealign (w[23], w[24], offset); + w[59] = amd_bytealign (w[22], w[23], offset); + w[58] = amd_bytealign (w[21], w[22], offset); + w[57] = amd_bytealign (w[20], w[21], offset); + w[56] = amd_bytealign (w[19], w[20], offset); + w[55] = amd_bytealign (w[18], w[19], offset); + w[54] = amd_bytealign (w[17], w[18], offset); + w[53] = amd_bytealign (w[16], w[17], offset); + w[52] = amd_bytealign (w[15], w[16], offset); + w[51] = 
amd_bytealign (w[14], w[15], offset); + w[50] = amd_bytealign (w[13], w[14], offset); + w[49] = amd_bytealign (w[12], w[13], offset); + w[48] = amd_bytealign (w[11], w[12], offset); + w[47] = amd_bytealign (w[10], w[11], offset); + w[46] = amd_bytealign (w[ 9], w[10], offset); + w[45] = amd_bytealign (w[ 8], w[ 9], offset); + w[44] = amd_bytealign (w[ 7], w[ 8], offset); + w[43] = amd_bytealign (w[ 6], w[ 7], offset); + w[42] = amd_bytealign (w[ 5], w[ 6], offset); + w[41] = amd_bytealign (w[ 4], w[ 5], offset); + w[40] = amd_bytealign (w[ 3], w[ 4], offset); + w[39] = amd_bytealign (w[ 2], w[ 3], offset); + w[38] = amd_bytealign (w[ 1], w[ 2], offset); + w[37] = amd_bytealign (w[ 0], w[ 1], offset); + w[36] = amd_bytealign ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign (w[25], w[26], offset); + w[62] = amd_bytealign (w[24], w[25], offset); + w[61] = amd_bytealign (w[23], w[24], offset); + w[60] = amd_bytealign (w[22], w[23], offset); + w[59] = amd_bytealign (w[21], w[22], offset); + w[58] = amd_bytealign (w[20], w[21], offset); + w[57] = amd_bytealign (w[19], w[20], offset); + w[56] = amd_bytealign (w[18], w[19], offset); + w[55] = amd_bytealign (w[17], w[18], offset); + w[54] = amd_bytealign (w[16], w[17], offset); + w[53] = amd_bytealign (w[15], w[16], offset); + w[52] = amd_bytealign (w[14], w[15], offset); + w[51] = amd_bytealign (w[13], w[14], offset); + w[50] = amd_bytealign (w[12], w[13], offset); + w[49] = amd_bytealign (w[11], w[12], offset); + w[48] = amd_bytealign (w[10], 
w[11], offset); + w[47] = amd_bytealign (w[ 9], w[10], offset); + w[46] = amd_bytealign (w[ 8], w[ 9], offset); + w[45] = amd_bytealign (w[ 7], w[ 8], offset); + w[44] = amd_bytealign (w[ 6], w[ 7], offset); + w[43] = amd_bytealign (w[ 5], w[ 6], offset); + w[42] = amd_bytealign (w[ 4], w[ 5], offset); + w[41] = amd_bytealign (w[ 3], w[ 4], offset); + w[40] = amd_bytealign (w[ 2], w[ 3], offset); + w[39] = amd_bytealign (w[ 1], w[ 2], offset); + w[38] = amd_bytealign (w[ 0], w[ 1], offset); + w[37] = amd_bytealign ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign (w[24], w[25], offset); + w[62] = amd_bytealign (w[23], w[24], offset); + w[61] = amd_bytealign (w[22], w[23], offset); + w[60] = amd_bytealign (w[21], w[22], offset); + w[59] = amd_bytealign (w[20], w[21], offset); + w[58] = amd_bytealign (w[19], w[20], offset); + w[57] = amd_bytealign (w[18], w[19], offset); + w[56] = amd_bytealign (w[17], w[18], offset); + w[55] = amd_bytealign (w[16], w[17], offset); + w[54] = amd_bytealign (w[15], w[16], offset); + w[53] = amd_bytealign (w[14], w[15], offset); + w[52] = amd_bytealign (w[13], w[14], offset); + w[51] = amd_bytealign (w[12], w[13], offset); + w[50] = amd_bytealign (w[11], w[12], offset); + w[49] = amd_bytealign (w[10], w[11], offset); + w[48] = amd_bytealign (w[ 9], w[10], offset); + w[47] = amd_bytealign (w[ 8], w[ 9], offset); + w[46] = amd_bytealign (w[ 7], w[ 8], offset); + w[45] = amd_bytealign (w[ 6], w[ 7], offset); + w[44] = amd_bytealign (w[ 5], w[ 6], 
offset); + w[43] = amd_bytealign (w[ 4], w[ 5], offset); + w[42] = amd_bytealign (w[ 3], w[ 4], offset); + w[41] = amd_bytealign (w[ 2], w[ 3], offset); + w[40] = amd_bytealign (w[ 1], w[ 2], offset); + w[39] = amd_bytealign (w[ 0], w[ 1], offset); + w[38] = amd_bytealign ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign (w[23], w[24], offset); + w[62] = amd_bytealign (w[22], w[23], offset); + w[61] = amd_bytealign (w[21], w[22], offset); + w[60] = amd_bytealign (w[20], w[21], offset); + w[59] = amd_bytealign (w[19], w[20], offset); + w[58] = amd_bytealign (w[18], w[19], offset); + w[57] = amd_bytealign (w[17], w[18], offset); + w[56] = amd_bytealign (w[16], w[17], offset); + w[55] = amd_bytealign (w[15], w[16], offset); + w[54] = amd_bytealign (w[14], w[15], offset); + w[53] = amd_bytealign (w[13], w[14], offset); + w[52] = amd_bytealign (w[12], w[13], offset); + w[51] = amd_bytealign (w[11], w[12], offset); + w[50] = amd_bytealign (w[10], w[11], offset); + w[49] = amd_bytealign (w[ 9], w[10], offset); + w[48] = amd_bytealign (w[ 8], w[ 9], offset); + w[47] = amd_bytealign (w[ 7], w[ 8], offset); + w[46] = amd_bytealign (w[ 6], w[ 7], offset); + w[45] = amd_bytealign (w[ 5], w[ 6], offset); + w[44] = amd_bytealign (w[ 4], w[ 5], offset); + w[43] = amd_bytealign (w[ 3], w[ 4], offset); + w[42] = amd_bytealign (w[ 2], w[ 3], offset); + w[41] = amd_bytealign (w[ 1], w[ 2], offset); + w[40] = amd_bytealign (w[ 0], w[ 1], offset); + w[39] = amd_bytealign ( 0, w[ 0], 
offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign (w[22], w[23], offset); + w[62] = amd_bytealign (w[21], w[22], offset); + w[61] = amd_bytealign (w[20], w[21], offset); + w[60] = amd_bytealign (w[19], w[20], offset); + w[59] = amd_bytealign (w[18], w[19], offset); + w[58] = amd_bytealign (w[17], w[18], offset); + w[57] = amd_bytealign (w[16], w[17], offset); + w[56] = amd_bytealign (w[15], w[16], offset); + w[55] = amd_bytealign (w[14], w[15], offset); + w[54] = amd_bytealign (w[13], w[14], offset); + w[53] = amd_bytealign (w[12], w[13], offset); + w[52] = amd_bytealign (w[11], w[12], offset); + w[51] = amd_bytealign (w[10], w[11], offset); + w[50] = amd_bytealign (w[ 9], w[10], offset); + w[49] = amd_bytealign (w[ 8], w[ 9], offset); + w[48] = amd_bytealign (w[ 7], w[ 8], offset); + w[47] = amd_bytealign (w[ 6], w[ 7], offset); + w[46] = amd_bytealign (w[ 5], w[ 6], offset); + w[45] = amd_bytealign (w[ 4], w[ 5], offset); + w[44] = amd_bytealign (w[ 3], w[ 4], offset); + w[43] = amd_bytealign (w[ 2], w[ 3], offset); + w[42] = amd_bytealign (w[ 1], w[ 2], offset); + w[41] = amd_bytealign (w[ 0], w[ 1], offset); + w[40] = amd_bytealign ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 
0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign (w[21], w[22], offset); + w[62] = amd_bytealign (w[20], w[21], offset); + w[61] = amd_bytealign (w[19], w[20], offset); + w[60] = amd_bytealign (w[18], w[19], offset); + w[59] = amd_bytealign (w[17], w[18], offset); + w[58] = amd_bytealign (w[16], w[17], offset); + w[57] = amd_bytealign (w[15], w[16], offset); + w[56] = amd_bytealign (w[14], w[15], offset); + w[55] = amd_bytealign (w[13], w[14], offset); + w[54] = amd_bytealign (w[12], w[13], offset); + w[53] = amd_bytealign (w[11], w[12], offset); + w[52] = amd_bytealign (w[10], w[11], offset); + w[51] = amd_bytealign (w[ 9], w[10], offset); + w[50] = amd_bytealign (w[ 8], w[ 9], offset); + w[49] = amd_bytealign (w[ 7], w[ 8], offset); + w[48] = amd_bytealign (w[ 6], w[ 7], offset); + w[47] = amd_bytealign (w[ 5], w[ 6], offset); + w[46] = amd_bytealign (w[ 4], w[ 5], offset); + w[45] = amd_bytealign (w[ 3], w[ 4], offset); + w[44] = amd_bytealign (w[ 2], w[ 3], offset); + w[43] = amd_bytealign (w[ 1], w[ 2], offset); + w[42] = amd_bytealign (w[ 0], w[ 1], offset); + w[41] = amd_bytealign ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign (w[20], w[21], offset); + w[62] = amd_bytealign (w[19], w[20], offset); + w[61] = 
amd_bytealign (w[18], w[19], offset); + w[60] = amd_bytealign (w[17], w[18], offset); + w[59] = amd_bytealign (w[16], w[17], offset); + w[58] = amd_bytealign (w[15], w[16], offset); + w[57] = amd_bytealign (w[14], w[15], offset); + w[56] = amd_bytealign (w[13], w[14], offset); + w[55] = amd_bytealign (w[12], w[13], offset); + w[54] = amd_bytealign (w[11], w[12], offset); + w[53] = amd_bytealign (w[10], w[11], offset); + w[52] = amd_bytealign (w[ 9], w[10], offset); + w[51] = amd_bytealign (w[ 8], w[ 9], offset); + w[50] = amd_bytealign (w[ 7], w[ 8], offset); + w[49] = amd_bytealign (w[ 6], w[ 7], offset); + w[48] = amd_bytealign (w[ 5], w[ 6], offset); + w[47] = amd_bytealign (w[ 4], w[ 5], offset); + w[46] = amd_bytealign (w[ 3], w[ 4], offset); + w[45] = amd_bytealign (w[ 2], w[ 3], offset); + w[44] = amd_bytealign (w[ 1], w[ 2], offset); + w[43] = amd_bytealign (w[ 0], w[ 1], offset); + w[42] = amd_bytealign ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign (w[19], w[20], offset); + w[62] = amd_bytealign (w[18], w[19], offset); + w[61] = amd_bytealign (w[17], w[18], offset); + w[60] = amd_bytealign (w[16], w[17], offset); + w[59] = amd_bytealign (w[15], w[16], offset); + w[58] = amd_bytealign (w[14], w[15], offset); + w[57] = amd_bytealign (w[13], w[14], offset); + w[56] = amd_bytealign (w[12], w[13], offset); + w[55] = amd_bytealign (w[11], w[12], offset); + w[54] = amd_bytealign (w[10], w[11], offset); + 
w[53] = amd_bytealign (w[ 9], w[10], offset); + w[52] = amd_bytealign (w[ 8], w[ 9], offset); + w[51] = amd_bytealign (w[ 7], w[ 8], offset); + w[50] = amd_bytealign (w[ 6], w[ 7], offset); + w[49] = amd_bytealign (w[ 5], w[ 6], offset); + w[48] = amd_bytealign (w[ 4], w[ 5], offset); + w[47] = amd_bytealign (w[ 3], w[ 4], offset); + w[46] = amd_bytealign (w[ 2], w[ 3], offset); + w[45] = amd_bytealign (w[ 1], w[ 2], offset); + w[44] = amd_bytealign (w[ 0], w[ 1], offset); + w[43] = amd_bytealign ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = amd_bytealign (w[18], w[19], offset); + w[62] = amd_bytealign (w[17], w[18], offset); + w[61] = amd_bytealign (w[16], w[17], offset); + w[60] = amd_bytealign (w[15], w[16], offset); + w[59] = amd_bytealign (w[14], w[15], offset); + w[58] = amd_bytealign (w[13], w[14], offset); + w[57] = amd_bytealign (w[12], w[13], offset); + w[56] = amd_bytealign (w[11], w[12], offset); + w[55] = amd_bytealign (w[10], w[11], offset); + w[54] = amd_bytealign (w[ 9], w[10], offset); + w[53] = amd_bytealign (w[ 8], w[ 9], offset); + w[52] = amd_bytealign (w[ 7], w[ 8], offset); + w[51] = amd_bytealign (w[ 6], w[ 7], offset); + w[50] = amd_bytealign (w[ 5], w[ 6], offset); + w[49] = amd_bytealign (w[ 4], w[ 5], offset); + w[48] = amd_bytealign (w[ 3], w[ 4], offset); + w[47] = amd_bytealign (w[ 2], w[ 3], offset); + w[46] = amd_bytealign (w[ 1], w[ 2], offset); + w[45] = amd_bytealign (w[ 
0], w[ 1], offset); + w[44] = amd_bytealign ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign (w[17], w[18], offset); + w[62] = amd_bytealign (w[16], w[17], offset); + w[61] = amd_bytealign (w[15], w[16], offset); + w[60] = amd_bytealign (w[14], w[15], offset); + w[59] = amd_bytealign (w[13], w[14], offset); + w[58] = amd_bytealign (w[12], w[13], offset); + w[57] = amd_bytealign (w[11], w[12], offset); + w[56] = amd_bytealign (w[10], w[11], offset); + w[55] = amd_bytealign (w[ 9], w[10], offset); + w[54] = amd_bytealign (w[ 8], w[ 9], offset); + w[53] = amd_bytealign (w[ 7], w[ 8], offset); + w[52] = amd_bytealign (w[ 6], w[ 7], offset); + w[51] = amd_bytealign (w[ 5], w[ 6], offset); + w[50] = amd_bytealign (w[ 4], w[ 5], offset); + w[49] = amd_bytealign (w[ 3], w[ 4], offset); + w[48] = amd_bytealign (w[ 2], w[ 3], offset); + w[47] = amd_bytealign (w[ 1], w[ 2], offset); + w[46] = amd_bytealign (w[ 0], w[ 1], offset); + w[45] = amd_bytealign ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; 
+ w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign (w[16], w[17], offset); + w[62] = amd_bytealign (w[15], w[16], offset); + w[61] = amd_bytealign (w[14], w[15], offset); + w[60] = amd_bytealign (w[13], w[14], offset); + w[59] = amd_bytealign (w[12], w[13], offset); + w[58] = amd_bytealign (w[11], w[12], offset); + w[57] = amd_bytealign (w[10], w[11], offset); + w[56] = amd_bytealign (w[ 9], w[10], offset); + w[55] = amd_bytealign (w[ 8], w[ 9], offset); + w[54] = amd_bytealign (w[ 7], w[ 8], offset); + w[53] = amd_bytealign (w[ 6], w[ 7], offset); + w[52] = amd_bytealign (w[ 5], w[ 6], offset); + w[51] = amd_bytealign (w[ 4], w[ 5], offset); + w[50] = amd_bytealign (w[ 3], w[ 4], offset); + w[49] = amd_bytealign (w[ 2], w[ 3], offset); + w[48] = amd_bytealign (w[ 1], w[ 2], offset); + w[47] = amd_bytealign (w[ 0], w[ 1], offset); + w[46] = amd_bytealign ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign (w[15], w[16], offset); + w[62] = amd_bytealign (w[14], w[15], offset); + w[61] = amd_bytealign (w[13], w[14], offset); + w[60] = amd_bytealign (w[12], w[13], offset); + w[59] = amd_bytealign (w[11], w[12], offset); + w[58] = amd_bytealign (w[10], w[11], offset); + w[57] = amd_bytealign (w[ 9], w[10], offset); 
+ w[56] = amd_bytealign (w[ 8], w[ 9], offset); + w[55] = amd_bytealign (w[ 7], w[ 8], offset); + w[54] = amd_bytealign (w[ 6], w[ 7], offset); + w[53] = amd_bytealign (w[ 5], w[ 6], offset); + w[52] = amd_bytealign (w[ 4], w[ 5], offset); + w[51] = amd_bytealign (w[ 3], w[ 4], offset); + w[50] = amd_bytealign (w[ 2], w[ 3], offset); + w[49] = amd_bytealign (w[ 1], w[ 2], offset); + w[48] = amd_bytealign (w[ 0], w[ 1], offset); + w[47] = amd_bytealign ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign (w[14], w[15], offset); + w[62] = amd_bytealign (w[13], w[14], offset); + w[61] = amd_bytealign (w[12], w[13], offset); + w[60] = amd_bytealign (w[11], w[12], offset); + w[59] = amd_bytealign (w[10], w[11], offset); + w[58] = amd_bytealign (w[ 9], w[10], offset); + w[57] = amd_bytealign (w[ 8], w[ 9], offset); + w[56] = amd_bytealign (w[ 7], w[ 8], offset); + w[55] = amd_bytealign (w[ 6], w[ 7], offset); + w[54] = amd_bytealign (w[ 5], w[ 6], offset); + w[53] = amd_bytealign (w[ 4], w[ 5], offset); + w[52] = amd_bytealign (w[ 3], w[ 4], offset); + w[51] = amd_bytealign (w[ 2], w[ 3], offset); + w[50] = amd_bytealign (w[ 1], w[ 2], offset); + w[49] = amd_bytealign (w[ 0], w[ 1], offset); + w[48] = amd_bytealign ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + 
w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign (w[13], w[14], offset); + w[62] = amd_bytealign (w[12], w[13], offset); + w[61] = amd_bytealign (w[11], w[12], offset); + w[60] = amd_bytealign (w[10], w[11], offset); + w[59] = amd_bytealign (w[ 9], w[10], offset); + w[58] = amd_bytealign (w[ 8], w[ 9], offset); + w[57] = amd_bytealign (w[ 7], w[ 8], offset); + w[56] = amd_bytealign (w[ 6], w[ 7], offset); + w[55] = amd_bytealign (w[ 5], w[ 6], offset); + w[54] = amd_bytealign (w[ 4], w[ 5], offset); + w[53] = amd_bytealign (w[ 3], w[ 4], offset); + w[52] = amd_bytealign (w[ 2], w[ 3], offset); + w[51] = amd_bytealign (w[ 1], w[ 2], offset); + w[50] = amd_bytealign (w[ 0], w[ 1], offset); + w[49] = amd_bytealign ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign (w[12], w[13], offset); + w[62] = amd_bytealign (w[11], w[12], 
offset); + w[61] = amd_bytealign (w[10], w[11], offset); + w[60] = amd_bytealign (w[ 9], w[10], offset); + w[59] = amd_bytealign (w[ 8], w[ 9], offset); + w[58] = amd_bytealign (w[ 7], w[ 8], offset); + w[57] = amd_bytealign (w[ 6], w[ 7], offset); + w[56] = amd_bytealign (w[ 5], w[ 6], offset); + w[55] = amd_bytealign (w[ 4], w[ 5], offset); + w[54] = amd_bytealign (w[ 3], w[ 4], offset); + w[53] = amd_bytealign (w[ 2], w[ 3], offset); + w[52] = amd_bytealign (w[ 1], w[ 2], offset); + w[51] = amd_bytealign (w[ 0], w[ 1], offset); + w[50] = amd_bytealign ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = amd_bytealign (w[11], w[12], offset); + w[62] = amd_bytealign (w[10], w[11], offset); + w[61] = amd_bytealign (w[ 9], w[10], offset); + w[60] = amd_bytealign (w[ 8], w[ 9], offset); + w[59] = amd_bytealign (w[ 7], w[ 8], offset); + w[58] = amd_bytealign (w[ 6], w[ 7], offset); + w[57] = amd_bytealign (w[ 5], w[ 6], offset); + w[56] = amd_bytealign (w[ 4], w[ 5], offset); + w[55] = amd_bytealign (w[ 3], w[ 4], offset); + w[54] = amd_bytealign (w[ 2], w[ 3], offset); + w[53] = amd_bytealign (w[ 1], w[ 2], offset); + w[52] = amd_bytealign (w[ 0], w[ 1], offset); + w[51] = amd_bytealign ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + 
w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign (w[10], w[11], offset); + w[62] = amd_bytealign (w[ 9], w[10], offset); + w[61] = amd_bytealign (w[ 8], w[ 9], offset); + w[60] = amd_bytealign (w[ 7], w[ 8], offset); + w[59] = amd_bytealign (w[ 6], w[ 7], offset); + w[58] = amd_bytealign (w[ 5], w[ 6], offset); + w[57] = amd_bytealign (w[ 4], w[ 5], offset); + w[56] = amd_bytealign (w[ 3], w[ 4], offset); + w[55] = amd_bytealign (w[ 2], w[ 3], offset); + w[54] = amd_bytealign (w[ 1], w[ 2], offset); + w[53] = amd_bytealign (w[ 0], w[ 1], offset); + w[52] = amd_bytealign ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign (w[ 9], w[10], offset); + w[62] = amd_bytealign (w[ 8], w[ 9], offset); + w[61] = amd_bytealign (w[ 7], w[ 8], offset); + w[60] = 
amd_bytealign (w[ 6], w[ 7], offset); + w[59] = amd_bytealign (w[ 5], w[ 6], offset); + w[58] = amd_bytealign (w[ 4], w[ 5], offset); + w[57] = amd_bytealign (w[ 3], w[ 4], offset); + w[56] = amd_bytealign (w[ 2], w[ 3], offset); + w[55] = amd_bytealign (w[ 1], w[ 2], offset); + w[54] = amd_bytealign (w[ 0], w[ 1], offset); + w[53] = amd_bytealign ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign (w[ 8], w[ 9], offset); + w[62] = amd_bytealign (w[ 7], w[ 8], offset); + w[61] = amd_bytealign (w[ 6], w[ 7], offset); + w[60] = amd_bytealign (w[ 5], w[ 6], offset); + w[59] = amd_bytealign (w[ 4], w[ 5], offset); + w[58] = amd_bytealign (w[ 3], w[ 4], offset); + w[57] = amd_bytealign (w[ 2], w[ 3], offset); + w[56] = amd_bytealign (w[ 1], w[ 2], offset); + w[55] = amd_bytealign (w[ 0], w[ 1], offset); + w[54] = amd_bytealign ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 
0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign (w[ 7], w[ 8], offset); + w[62] = amd_bytealign (w[ 6], w[ 7], offset); + w[61] = amd_bytealign (w[ 5], w[ 6], offset); + w[60] = amd_bytealign (w[ 4], w[ 5], offset); + w[59] = amd_bytealign (w[ 3], w[ 4], offset); + w[58] = amd_bytealign (w[ 2], w[ 3], offset); + w[57] = amd_bytealign (w[ 1], w[ 2], offset); + w[56] = amd_bytealign (w[ 0], w[ 1], offset); + w[55] = amd_bytealign ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign (w[ 6], w[ 7], offset); + w[62] = amd_bytealign (w[ 5], w[ 6], offset); + w[61] = amd_bytealign (w[ 4], w[ 5], offset); + w[60] = amd_bytealign (w[ 3], w[ 4], offset); + w[59] = amd_bytealign (w[ 2], w[ 3], offset); + w[58] = amd_bytealign (w[ 1], w[ 2], offset); + w[57] = amd_bytealign (w[ 0], w[ 1], offset); + w[56] = amd_bytealign ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] 
= 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign (w[ 5], w[ 6], offset); + w[62] = amd_bytealign (w[ 4], w[ 5], offset); + w[61] = amd_bytealign (w[ 3], w[ 4], offset); + w[60] = amd_bytealign (w[ 2], w[ 3], offset); + w[59] = amd_bytealign (w[ 1], w[ 2], offset); + w[58] = amd_bytealign (w[ 0], w[ 1], offset); + w[57] = amd_bytealign ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign (w[ 4], w[ 5], offset); + w[62] = amd_bytealign (w[ 3], w[ 4], offset); + w[61] = amd_bytealign (w[ 2], w[ 3], offset); + w[60] = amd_bytealign (w[ 1], w[ 2], offset); + w[59] = amd_bytealign (w[ 0], w[ 1], offset); + w[58] = amd_bytealign ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + 
w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = amd_bytealign (w[ 3], w[ 4], offset); + w[62] = amd_bytealign (w[ 2], w[ 3], offset); + w[61] = amd_bytealign (w[ 1], w[ 2], offset); + w[60] = amd_bytealign (w[ 0], w[ 1], offset); + w[59] = amd_bytealign ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = amd_bytealign (w[ 2], w[ 3], offset); + w[62] = amd_bytealign (w[ 1], w[ 2], offset); + w[61] = amd_bytealign (w[ 0], w[ 1], offset); + w[60] = amd_bytealign ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + 
w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign (w[ 1], w[ 2], offset); + w[62] = amd_bytealign (w[ 0], w[ 1], offset); + w[61] = amd_bytealign ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign (w[ 0], w[ 1], offset); + w[62] = amd_bytealign ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + 
w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign ( 0, w[ 0], offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm (w[63], w[62], selector); + w[62] = __byte_perm (w[62], w[61], selector); + w[61] = __byte_perm (w[61], w[60], selector); + w[60] = __byte_perm (w[60], w[59], selector); + w[59] = __byte_perm (w[59], w[58], selector); + w[58] = __byte_perm (w[58], w[57], selector); + w[57] = __byte_perm (w[57], w[56], 
selector); + w[56] = __byte_perm (w[56], w[55], selector); + w[55] = __byte_perm (w[55], w[54], selector); + w[54] = __byte_perm (w[54], w[53], selector); + w[53] = __byte_perm (w[53], w[52], selector); + w[52] = __byte_perm (w[52], w[51], selector); + w[51] = __byte_perm (w[51], w[50], selector); + w[50] = __byte_perm (w[50], w[49], selector); + w[49] = __byte_perm (w[49], w[48], selector); + w[48] = __byte_perm (w[48], w[47], selector); + w[47] = __byte_perm (w[47], w[46], selector); + w[46] = __byte_perm (w[46], w[45], selector); + w[45] = __byte_perm (w[45], w[44], selector); + w[44] = __byte_perm (w[44], w[43], selector); + w[43] = __byte_perm (w[43], w[42], selector); + w[42] = __byte_perm (w[42], w[41], selector); + w[41] = __byte_perm (w[41], w[40], selector); + w[40] = __byte_perm (w[40], w[39], selector); + w[39] = __byte_perm (w[39], w[38], selector); + w[38] = __byte_perm (w[38], w[37], selector); + w[37] = __byte_perm (w[37], w[36], selector); + w[36] = __byte_perm (w[36], w[35], selector); + w[35] = __byte_perm (w[35], w[34], selector); + w[34] = __byte_perm (w[34], w[33], selector); + w[33] = __byte_perm (w[33], w[32], selector); + w[32] = __byte_perm (w[32], w[31], selector); + w[31] = __byte_perm (w[31], w[30], selector); + w[30] = __byte_perm (w[30], w[29], selector); + w[29] = __byte_perm (w[29], w[28], selector); + w[28] = __byte_perm (w[28], w[27], selector); + w[27] = __byte_perm (w[27], w[26], selector); + w[26] = __byte_perm (w[26], w[25], selector); + w[25] = __byte_perm (w[25], w[24], selector); + w[24] = __byte_perm (w[24], w[23], selector); + w[23] = __byte_perm (w[23], w[22], selector); + w[22] = __byte_perm (w[22], w[21], selector); + w[21] = __byte_perm (w[21], w[20], selector); + w[20] = __byte_perm (w[20], w[19], selector); + w[19] = __byte_perm (w[19], w[18], selector); + w[18] = __byte_perm (w[18], w[17], selector); + w[17] = __byte_perm (w[17], w[16], selector); + w[16] = __byte_perm (w[16], w[15], selector); + w[15] = 
__byte_perm (w[15], w[14], selector); + w[14] = __byte_perm (w[14], w[13], selector); + w[13] = __byte_perm (w[13], w[12], selector); + w[12] = __byte_perm (w[12], w[11], selector); + w[11] = __byte_perm (w[11], w[10], selector); + w[10] = __byte_perm (w[10], w[ 9], selector); + w[ 9] = __byte_perm (w[ 9], w[ 8], selector); + w[ 8] = __byte_perm (w[ 8], w[ 7], selector); + w[ 7] = __byte_perm (w[ 7], w[ 6], selector); + w[ 6] = __byte_perm (w[ 6], w[ 5], selector); + w[ 5] = __byte_perm (w[ 5], w[ 4], selector); + w[ 4] = __byte_perm (w[ 4], w[ 3], selector); + w[ 3] = __byte_perm (w[ 3], w[ 2], selector); + w[ 2] = __byte_perm (w[ 2], w[ 1], selector); + w[ 1] = __byte_perm (w[ 1], w[ 0], selector); + w[ 0] = __byte_perm (w[ 0], 0, selector); + + break; + + case 1: + w[63] = __byte_perm (w[62], w[61], selector); + w[62] = __byte_perm (w[61], w[60], selector); + w[61] = __byte_perm (w[60], w[59], selector); + w[60] = __byte_perm (w[59], w[58], selector); + w[59] = __byte_perm (w[58], w[57], selector); + w[58] = __byte_perm (w[57], w[56], selector); + w[57] = __byte_perm (w[56], w[55], selector); + w[56] = __byte_perm (w[55], w[54], selector); + w[55] = __byte_perm (w[54], w[53], selector); + w[54] = __byte_perm (w[53], w[52], selector); + w[53] = __byte_perm (w[52], w[51], selector); + w[52] = __byte_perm (w[51], w[50], selector); + w[51] = __byte_perm (w[50], w[49], selector); + w[50] = __byte_perm (w[49], w[48], selector); + w[49] = __byte_perm (w[48], w[47], selector); + w[48] = __byte_perm (w[47], w[46], selector); + w[47] = __byte_perm (w[46], w[45], selector); + w[46] = __byte_perm (w[45], w[44], selector); + w[45] = __byte_perm (w[44], w[43], selector); + w[44] = __byte_perm (w[43], w[42], selector); + w[43] = __byte_perm (w[42], w[41], selector); + w[42] = __byte_perm (w[41], w[40], selector); + w[41] = __byte_perm (w[40], w[39], selector); + w[40] = __byte_perm (w[39], w[38], selector); + w[39] = __byte_perm (w[38], w[37], selector); + w[38] = __byte_perm 
(w[37], w[36], selector); + w[37] = __byte_perm (w[36], w[35], selector); + w[36] = __byte_perm (w[35], w[34], selector); + w[35] = __byte_perm (w[34], w[33], selector); + w[34] = __byte_perm (w[33], w[32], selector); + w[33] = __byte_perm (w[32], w[31], selector); + w[32] = __byte_perm (w[31], w[30], selector); + w[31] = __byte_perm (w[30], w[29], selector); + w[30] = __byte_perm (w[29], w[28], selector); + w[29] = __byte_perm (w[28], w[27], selector); + w[28] = __byte_perm (w[27], w[26], selector); + w[27] = __byte_perm (w[26], w[25], selector); + w[26] = __byte_perm (w[25], w[24], selector); + w[25] = __byte_perm (w[24], w[23], selector); + w[24] = __byte_perm (w[23], w[22], selector); + w[23] = __byte_perm (w[22], w[21], selector); + w[22] = __byte_perm (w[21], w[20], selector); + w[21] = __byte_perm (w[20], w[19], selector); + w[20] = __byte_perm (w[19], w[18], selector); + w[19] = __byte_perm (w[18], w[17], selector); + w[18] = __byte_perm (w[17], w[16], selector); + w[17] = __byte_perm (w[16], w[15], selector); + w[16] = __byte_perm (w[15], w[14], selector); + w[15] = __byte_perm (w[14], w[13], selector); + w[14] = __byte_perm (w[13], w[12], selector); + w[13] = __byte_perm (w[12], w[11], selector); + w[12] = __byte_perm (w[11], w[10], selector); + w[11] = __byte_perm (w[10], w[ 9], selector); + w[10] = __byte_perm (w[ 9], w[ 8], selector); + w[ 9] = __byte_perm (w[ 8], w[ 7], selector); + w[ 8] = __byte_perm (w[ 7], w[ 6], selector); + w[ 7] = __byte_perm (w[ 6], w[ 5], selector); + w[ 6] = __byte_perm (w[ 5], w[ 4], selector); + w[ 5] = __byte_perm (w[ 4], w[ 3], selector); + w[ 4] = __byte_perm (w[ 3], w[ 2], selector); + w[ 3] = __byte_perm (w[ 2], w[ 1], selector); + w[ 2] = __byte_perm (w[ 1], w[ 0], selector); + w[ 1] = __byte_perm (w[ 0], 0, selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm (w[61], w[60], selector); + w[62] = __byte_perm (w[60], w[59], selector); + w[61] = __byte_perm (w[59], w[58], selector); + w[60] = __byte_perm 
(w[58], w[57], selector); + w[59] = __byte_perm (w[57], w[56], selector); + w[58] = __byte_perm (w[56], w[55], selector); + w[57] = __byte_perm (w[55], w[54], selector); + w[56] = __byte_perm (w[54], w[53], selector); + w[55] = __byte_perm (w[53], w[52], selector); + w[54] = __byte_perm (w[52], w[51], selector); + w[53] = __byte_perm (w[51], w[50], selector); + w[52] = __byte_perm (w[50], w[49], selector); + w[51] = __byte_perm (w[49], w[48], selector); + w[50] = __byte_perm (w[48], w[47], selector); + w[49] = __byte_perm (w[47], w[46], selector); + w[48] = __byte_perm (w[46], w[45], selector); + w[47] = __byte_perm (w[45], w[44], selector); + w[46] = __byte_perm (w[44], w[43], selector); + w[45] = __byte_perm (w[43], w[42], selector); + w[44] = __byte_perm (w[42], w[41], selector); + w[43] = __byte_perm (w[41], w[40], selector); + w[42] = __byte_perm (w[40], w[39], selector); + w[41] = __byte_perm (w[39], w[38], selector); + w[40] = __byte_perm (w[38], w[37], selector); + w[39] = __byte_perm (w[37], w[36], selector); + w[38] = __byte_perm (w[36], w[35], selector); + w[37] = __byte_perm (w[35], w[34], selector); + w[36] = __byte_perm (w[34], w[33], selector); + w[35] = __byte_perm (w[33], w[32], selector); + w[34] = __byte_perm (w[32], w[31], selector); + w[33] = __byte_perm (w[31], w[30], selector); + w[32] = __byte_perm (w[30], w[29], selector); + w[31] = __byte_perm (w[29], w[28], selector); + w[30] = __byte_perm (w[28], w[27], selector); + w[29] = __byte_perm (w[27], w[26], selector); + w[28] = __byte_perm (w[26], w[25], selector); + w[27] = __byte_perm (w[25], w[24], selector); + w[26] = __byte_perm (w[24], w[23], selector); + w[25] = __byte_perm (w[23], w[22], selector); + w[24] = __byte_perm (w[22], w[21], selector); + w[23] = __byte_perm (w[21], w[20], selector); + w[22] = __byte_perm (w[20], w[19], selector); + w[21] = __byte_perm (w[19], w[18], selector); + w[20] = __byte_perm (w[18], w[17], selector); + w[19] = __byte_perm (w[17], w[16], selector); + 
w[18] = __byte_perm (w[16], w[15], selector); + w[17] = __byte_perm (w[15], w[14], selector); + w[16] = __byte_perm (w[14], w[13], selector); + w[15] = __byte_perm (w[13], w[12], selector); + w[14] = __byte_perm (w[12], w[11], selector); + w[13] = __byte_perm (w[11], w[10], selector); + w[12] = __byte_perm (w[10], w[ 9], selector); + w[11] = __byte_perm (w[ 9], w[ 8], selector); + w[10] = __byte_perm (w[ 8], w[ 7], selector); + w[ 9] = __byte_perm (w[ 7], w[ 6], selector); + w[ 8] = __byte_perm (w[ 6], w[ 5], selector); + w[ 7] = __byte_perm (w[ 5], w[ 4], selector); + w[ 6] = __byte_perm (w[ 4], w[ 3], selector); + w[ 5] = __byte_perm (w[ 3], w[ 2], selector); + w[ 4] = __byte_perm (w[ 2], w[ 1], selector); + w[ 3] = __byte_perm (w[ 1], w[ 0], selector); + w[ 2] = __byte_perm (w[ 0], 0, selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm (w[60], w[59], selector); + w[62] = __byte_perm (w[59], w[58], selector); + w[61] = __byte_perm (w[58], w[57], selector); + w[60] = __byte_perm (w[57], w[56], selector); + w[59] = __byte_perm (w[56], w[55], selector); + w[58] = __byte_perm (w[55], w[54], selector); + w[57] = __byte_perm (w[54], w[53], selector); + w[56] = __byte_perm (w[53], w[52], selector); + w[55] = __byte_perm (w[52], w[51], selector); + w[54] = __byte_perm (w[51], w[50], selector); + w[53] = __byte_perm (w[50], w[49], selector); + w[52] = __byte_perm (w[49], w[48], selector); + w[51] = __byte_perm (w[48], w[47], selector); + w[50] = __byte_perm (w[47], w[46], selector); + w[49] = __byte_perm (w[46], w[45], selector); + w[48] = __byte_perm (w[45], w[44], selector); + w[47] = __byte_perm (w[44], w[43], selector); + w[46] = __byte_perm (w[43], w[42], selector); + w[45] = __byte_perm (w[42], w[41], selector); + w[44] = __byte_perm (w[41], w[40], selector); + w[43] = __byte_perm (w[40], w[39], selector); + w[42] = __byte_perm (w[39], w[38], selector); + w[41] = __byte_perm (w[38], w[37], selector); + w[40] = __byte_perm (w[37], w[36], 
selector); + w[39] = __byte_perm (w[36], w[35], selector); + w[38] = __byte_perm (w[35], w[34], selector); + w[37] = __byte_perm (w[34], w[33], selector); + w[36] = __byte_perm (w[33], w[32], selector); + w[35] = __byte_perm (w[32], w[31], selector); + w[34] = __byte_perm (w[31], w[30], selector); + w[33] = __byte_perm (w[30], w[29], selector); + w[32] = __byte_perm (w[29], w[28], selector); + w[31] = __byte_perm (w[28], w[27], selector); + w[30] = __byte_perm (w[27], w[26], selector); + w[29] = __byte_perm (w[26], w[25], selector); + w[28] = __byte_perm (w[25], w[24], selector); + w[27] = __byte_perm (w[24], w[23], selector); + w[26] = __byte_perm (w[23], w[22], selector); + w[25] = __byte_perm (w[22], w[21], selector); + w[24] = __byte_perm (w[21], w[20], selector); + w[23] = __byte_perm (w[20], w[19], selector); + w[22] = __byte_perm (w[19], w[18], selector); + w[21] = __byte_perm (w[18], w[17], selector); + w[20] = __byte_perm (w[17], w[16], selector); + w[19] = __byte_perm (w[16], w[15], selector); + w[18] = __byte_perm (w[15], w[14], selector); + w[17] = __byte_perm (w[14], w[13], selector); + w[16] = __byte_perm (w[13], w[12], selector); + w[15] = __byte_perm (w[12], w[11], selector); + w[14] = __byte_perm (w[11], w[10], selector); + w[13] = __byte_perm (w[10], w[ 9], selector); + w[12] = __byte_perm (w[ 9], w[ 8], selector); + w[11] = __byte_perm (w[ 8], w[ 7], selector); + w[10] = __byte_perm (w[ 7], w[ 6], selector); + w[ 9] = __byte_perm (w[ 6], w[ 5], selector); + w[ 8] = __byte_perm (w[ 5], w[ 4], selector); + w[ 7] = __byte_perm (w[ 4], w[ 3], selector); + w[ 6] = __byte_perm (w[ 3], w[ 2], selector); + w[ 5] = __byte_perm (w[ 2], w[ 1], selector); + w[ 4] = __byte_perm (w[ 1], w[ 0], selector); + w[ 3] = __byte_perm (w[ 0], 0, selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm (w[59], w[58], selector); + w[62] = __byte_perm (w[58], w[57], selector); + w[61] = __byte_perm (w[57], w[56], selector); + w[60] = 
__byte_perm (w[56], w[55], selector); + w[59] = __byte_perm (w[55], w[54], selector); + w[58] = __byte_perm (w[54], w[53], selector); + w[57] = __byte_perm (w[53], w[52], selector); + w[56] = __byte_perm (w[52], w[51], selector); + w[55] = __byte_perm (w[51], w[50], selector); + w[54] = __byte_perm (w[50], w[49], selector); + w[53] = __byte_perm (w[49], w[48], selector); + w[52] = __byte_perm (w[48], w[47], selector); + w[51] = __byte_perm (w[47], w[46], selector); + w[50] = __byte_perm (w[46], w[45], selector); + w[49] = __byte_perm (w[45], w[44], selector); + w[48] = __byte_perm (w[44], w[43], selector); + w[47] = __byte_perm (w[43], w[42], selector); + w[46] = __byte_perm (w[42], w[41], selector); + w[45] = __byte_perm (w[41], w[40], selector); + w[44] = __byte_perm (w[40], w[39], selector); + w[43] = __byte_perm (w[39], w[38], selector); + w[42] = __byte_perm (w[38], w[37], selector); + w[41] = __byte_perm (w[37], w[36], selector); + w[40] = __byte_perm (w[36], w[35], selector); + w[39] = __byte_perm (w[35], w[34], selector); + w[38] = __byte_perm (w[34], w[33], selector); + w[37] = __byte_perm (w[33], w[32], selector); + w[36] = __byte_perm (w[32], w[31], selector); + w[35] = __byte_perm (w[31], w[30], selector); + w[34] = __byte_perm (w[30], w[29], selector); + w[33] = __byte_perm (w[29], w[28], selector); + w[32] = __byte_perm (w[28], w[27], selector); + w[31] = __byte_perm (w[27], w[26], selector); + w[30] = __byte_perm (w[26], w[25], selector); + w[29] = __byte_perm (w[25], w[24], selector); + w[28] = __byte_perm (w[24], w[23], selector); + w[27] = __byte_perm (w[23], w[22], selector); + w[26] = __byte_perm (w[22], w[21], selector); + w[25] = __byte_perm (w[21], w[20], selector); + w[24] = __byte_perm (w[20], w[19], selector); + w[23] = __byte_perm (w[19], w[18], selector); + w[22] = __byte_perm (w[18], w[17], selector); + w[21] = __byte_perm (w[17], w[16], selector); + w[20] = __byte_perm (w[16], w[15], selector); + w[19] = __byte_perm (w[15], w[14], 
selector); + w[18] = __byte_perm (w[14], w[13], selector); + w[17] = __byte_perm (w[13], w[12], selector); + w[16] = __byte_perm (w[12], w[11], selector); + w[15] = __byte_perm (w[11], w[10], selector); + w[14] = __byte_perm (w[10], w[ 9], selector); + w[13] = __byte_perm (w[ 9], w[ 8], selector); + w[12] = __byte_perm (w[ 8], w[ 7], selector); + w[11] = __byte_perm (w[ 7], w[ 6], selector); + w[10] = __byte_perm (w[ 6], w[ 5], selector); + w[ 9] = __byte_perm (w[ 5], w[ 4], selector); + w[ 8] = __byte_perm (w[ 4], w[ 3], selector); + w[ 7] = __byte_perm (w[ 3], w[ 2], selector); + w[ 6] = __byte_perm (w[ 2], w[ 1], selector); + w[ 5] = __byte_perm (w[ 1], w[ 0], selector); + w[ 4] = __byte_perm (w[ 0], 0, selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm (w[58], w[57], selector); + w[62] = __byte_perm (w[57], w[56], selector); + w[61] = __byte_perm (w[56], w[55], selector); + w[60] = __byte_perm (w[55], w[54], selector); + w[59] = __byte_perm (w[54], w[53], selector); + w[58] = __byte_perm (w[53], w[52], selector); + w[57] = __byte_perm (w[52], w[51], selector); + w[56] = __byte_perm (w[51], w[50], selector); + w[55] = __byte_perm (w[50], w[49], selector); + w[54] = __byte_perm (w[49], w[48], selector); + w[53] = __byte_perm (w[48], w[47], selector); + w[52] = __byte_perm (w[47], w[46], selector); + w[51] = __byte_perm (w[46], w[45], selector); + w[50] = __byte_perm (w[45], w[44], selector); + w[49] = __byte_perm (w[44], w[43], selector); + w[48] = __byte_perm (w[43], w[42], selector); + w[47] = __byte_perm (w[42], w[41], selector); + w[46] = __byte_perm (w[41], w[40], selector); + w[45] = __byte_perm (w[40], w[39], selector); + w[44] = __byte_perm (w[39], w[38], selector); + w[43] = __byte_perm (w[38], w[37], selector); + w[42] = __byte_perm (w[37], w[36], selector); + w[41] = __byte_perm (w[36], w[35], selector); + w[40] = __byte_perm (w[35], w[34], selector); + w[39] = __byte_perm (w[34], w[33], 
selector); + w[38] = __byte_perm (w[33], w[32], selector); + w[37] = __byte_perm (w[32], w[31], selector); + w[36] = __byte_perm (w[31], w[30], selector); + w[35] = __byte_perm (w[30], w[29], selector); + w[34] = __byte_perm (w[29], w[28], selector); + w[33] = __byte_perm (w[28], w[27], selector); + w[32] = __byte_perm (w[27], w[26], selector); + w[31] = __byte_perm (w[26], w[25], selector); + w[30] = __byte_perm (w[25], w[24], selector); + w[29] = __byte_perm (w[24], w[23], selector); + w[28] = __byte_perm (w[23], w[22], selector); + w[27] = __byte_perm (w[22], w[21], selector); + w[26] = __byte_perm (w[21], w[20], selector); + w[25] = __byte_perm (w[20], w[19], selector); + w[24] = __byte_perm (w[19], w[18], selector); + w[23] = __byte_perm (w[18], w[17], selector); + w[22] = __byte_perm (w[17], w[16], selector); + w[21] = __byte_perm (w[16], w[15], selector); + w[20] = __byte_perm (w[15], w[14], selector); + w[19] = __byte_perm (w[14], w[13], selector); + w[18] = __byte_perm (w[13], w[12], selector); + w[17] = __byte_perm (w[12], w[11], selector); + w[16] = __byte_perm (w[11], w[10], selector); + w[15] = __byte_perm (w[10], w[ 9], selector); + w[14] = __byte_perm (w[ 9], w[ 8], selector); + w[13] = __byte_perm (w[ 8], w[ 7], selector); + w[12] = __byte_perm (w[ 7], w[ 6], selector); + w[11] = __byte_perm (w[ 6], w[ 5], selector); + w[10] = __byte_perm (w[ 5], w[ 4], selector); + w[ 9] = __byte_perm (w[ 4], w[ 3], selector); + w[ 8] = __byte_perm (w[ 3], w[ 2], selector); + w[ 7] = __byte_perm (w[ 2], w[ 1], selector); + w[ 6] = __byte_perm (w[ 1], w[ 0], selector); + w[ 5] = __byte_perm (w[ 0], 0, selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm (w[57], w[56], selector); + w[62] = __byte_perm (w[56], w[55], selector); + w[61] = __byte_perm (w[55], w[54], selector); + w[60] = __byte_perm (w[54], w[53], selector); + w[59] = __byte_perm (w[53], w[52], selector); + w[58] = __byte_perm (w[52], 
w[51], selector); + w[57] = __byte_perm (w[51], w[50], selector); + w[56] = __byte_perm (w[50], w[49], selector); + w[55] = __byte_perm (w[49], w[48], selector); + w[54] = __byte_perm (w[48], w[47], selector); + w[53] = __byte_perm (w[47], w[46], selector); + w[52] = __byte_perm (w[46], w[45], selector); + w[51] = __byte_perm (w[45], w[44], selector); + w[50] = __byte_perm (w[44], w[43], selector); + w[49] = __byte_perm (w[43], w[42], selector); + w[48] = __byte_perm (w[42], w[41], selector); + w[47] = __byte_perm (w[41], w[40], selector); + w[46] = __byte_perm (w[40], w[39], selector); + w[45] = __byte_perm (w[39], w[38], selector); + w[44] = __byte_perm (w[38], w[37], selector); + w[43] = __byte_perm (w[37], w[36], selector); + w[42] = __byte_perm (w[36], w[35], selector); + w[41] = __byte_perm (w[35], w[34], selector); + w[40] = __byte_perm (w[34], w[33], selector); + w[39] = __byte_perm (w[33], w[32], selector); + w[38] = __byte_perm (w[32], w[31], selector); + w[37] = __byte_perm (w[31], w[30], selector); + w[36] = __byte_perm (w[30], w[29], selector); + w[35] = __byte_perm (w[29], w[28], selector); + w[34] = __byte_perm (w[28], w[27], selector); + w[33] = __byte_perm (w[27], w[26], selector); + w[32] = __byte_perm (w[26], w[25], selector); + w[31] = __byte_perm (w[25], w[24], selector); + w[30] = __byte_perm (w[24], w[23], selector); + w[29] = __byte_perm (w[23], w[22], selector); + w[28] = __byte_perm (w[22], w[21], selector); + w[27] = __byte_perm (w[21], w[20], selector); + w[26] = __byte_perm (w[20], w[19], selector); + w[25] = __byte_perm (w[19], w[18], selector); + w[24] = __byte_perm (w[18], w[17], selector); + w[23] = __byte_perm (w[17], w[16], selector); + w[22] = __byte_perm (w[16], w[15], selector); + w[21] = __byte_perm (w[15], w[14], selector); + w[20] = __byte_perm (w[14], w[13], selector); + w[19] = __byte_perm (w[13], w[12], selector); + w[18] = __byte_perm (w[12], w[11], selector); + w[17] = __byte_perm (w[11], w[10], selector); + w[16] = 
__byte_perm (w[10], w[ 9], selector); + w[15] = __byte_perm (w[ 9], w[ 8], selector); + w[14] = __byte_perm (w[ 8], w[ 7], selector); + w[13] = __byte_perm (w[ 7], w[ 6], selector); + w[12] = __byte_perm (w[ 6], w[ 5], selector); + w[11] = __byte_perm (w[ 5], w[ 4], selector); + w[10] = __byte_perm (w[ 4], w[ 3], selector); + w[ 9] = __byte_perm (w[ 3], w[ 2], selector); + w[ 8] = __byte_perm (w[ 2], w[ 1], selector); + w[ 7] = __byte_perm (w[ 1], w[ 0], selector); + w[ 6] = __byte_perm (w[ 0], 0, selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm (w[56], w[55], selector); + w[62] = __byte_perm (w[55], w[54], selector); + w[61] = __byte_perm (w[54], w[53], selector); + w[60] = __byte_perm (w[53], w[52], selector); + w[59] = __byte_perm (w[52], w[51], selector); + w[58] = __byte_perm (w[51], w[50], selector); + w[57] = __byte_perm (w[50], w[49], selector); + w[56] = __byte_perm (w[49], w[48], selector); + w[55] = __byte_perm (w[48], w[47], selector); + w[54] = __byte_perm (w[47], w[46], selector); + w[53] = __byte_perm (w[46], w[45], selector); + w[52] = __byte_perm (w[45], w[44], selector); + w[51] = __byte_perm (w[44], w[43], selector); + w[50] = __byte_perm (w[43], w[42], selector); + w[49] = __byte_perm (w[42], w[41], selector); + w[48] = __byte_perm (w[41], w[40], selector); + w[47] = __byte_perm (w[40], w[39], selector); + w[46] = __byte_perm (w[39], w[38], selector); + w[45] = __byte_perm (w[38], w[37], selector); + w[44] = __byte_perm (w[37], w[36], selector); + w[43] = __byte_perm (w[36], w[35], selector); + w[42] = __byte_perm (w[35], w[34], selector); + w[41] = __byte_perm (w[34], w[33], selector); + w[40] = __byte_perm (w[33], w[32], selector); + w[39] = __byte_perm (w[32], w[31], selector); + w[38] = __byte_perm (w[31], w[30], selector); + w[37] = __byte_perm (w[30], w[29], selector); + w[36] = __byte_perm (w[29], w[28], selector); + w[35] = __byte_perm (w[28], w[27], 
selector); + w[34] = __byte_perm (w[27], w[26], selector); + w[33] = __byte_perm (w[26], w[25], selector); + w[32] = __byte_perm (w[25], w[24], selector); + w[31] = __byte_perm (w[24], w[23], selector); + w[30] = __byte_perm (w[23], w[22], selector); + w[29] = __byte_perm (w[22], w[21], selector); + w[28] = __byte_perm (w[21], w[20], selector); + w[27] = __byte_perm (w[20], w[19], selector); + w[26] = __byte_perm (w[19], w[18], selector); + w[25] = __byte_perm (w[18], w[17], selector); + w[24] = __byte_perm (w[17], w[16], selector); + w[23] = __byte_perm (w[16], w[15], selector); + w[22] = __byte_perm (w[15], w[14], selector); + w[21] = __byte_perm (w[14], w[13], selector); + w[20] = __byte_perm (w[13], w[12], selector); + w[19] = __byte_perm (w[12], w[11], selector); + w[18] = __byte_perm (w[11], w[10], selector); + w[17] = __byte_perm (w[10], w[ 9], selector); + w[16] = __byte_perm (w[ 9], w[ 8], selector); + w[15] = __byte_perm (w[ 8], w[ 7], selector); + w[14] = __byte_perm (w[ 7], w[ 6], selector); + w[13] = __byte_perm (w[ 6], w[ 5], selector); + w[12] = __byte_perm (w[ 5], w[ 4], selector); + w[11] = __byte_perm (w[ 4], w[ 3], selector); + w[10] = __byte_perm (w[ 3], w[ 2], selector); + w[ 9] = __byte_perm (w[ 2], w[ 1], selector); + w[ 8] = __byte_perm (w[ 1], w[ 0], selector); + w[ 7] = __byte_perm (w[ 0], 0, selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = __byte_perm (w[55], w[54], selector); + w[62] = __byte_perm (w[54], w[53], selector); + w[61] = __byte_perm (w[53], w[52], selector); + w[60] = __byte_perm (w[52], w[51], selector); + w[59] = __byte_perm (w[51], w[50], selector); + w[58] = __byte_perm (w[50], w[49], selector); + w[57] = __byte_perm (w[49], w[48], selector); + w[56] = __byte_perm (w[48], w[47], selector); + w[55] = __byte_perm (w[47], w[46], selector); + w[54] = __byte_perm (w[46], w[45], selector); + w[53] = __byte_perm (w[45], w[44], selector); + 
w[52] = __byte_perm (w[44], w[43], selector); + w[51] = __byte_perm (w[43], w[42], selector); + w[50] = __byte_perm (w[42], w[41], selector); + w[49] = __byte_perm (w[41], w[40], selector); + w[48] = __byte_perm (w[40], w[39], selector); + w[47] = __byte_perm (w[39], w[38], selector); + w[46] = __byte_perm (w[38], w[37], selector); + w[45] = __byte_perm (w[37], w[36], selector); + w[44] = __byte_perm (w[36], w[35], selector); + w[43] = __byte_perm (w[35], w[34], selector); + w[42] = __byte_perm (w[34], w[33], selector); + w[41] = __byte_perm (w[33], w[32], selector); + w[40] = __byte_perm (w[32], w[31], selector); + w[39] = __byte_perm (w[31], w[30], selector); + w[38] = __byte_perm (w[30], w[29], selector); + w[37] = __byte_perm (w[29], w[28], selector); + w[36] = __byte_perm (w[28], w[27], selector); + w[35] = __byte_perm (w[27], w[26], selector); + w[34] = __byte_perm (w[26], w[25], selector); + w[33] = __byte_perm (w[25], w[24], selector); + w[32] = __byte_perm (w[24], w[23], selector); + w[31] = __byte_perm (w[23], w[22], selector); + w[30] = __byte_perm (w[22], w[21], selector); + w[29] = __byte_perm (w[21], w[20], selector); + w[28] = __byte_perm (w[20], w[19], selector); + w[27] = __byte_perm (w[19], w[18], selector); + w[26] = __byte_perm (w[18], w[17], selector); + w[25] = __byte_perm (w[17], w[16], selector); + w[24] = __byte_perm (w[16], w[15], selector); + w[23] = __byte_perm (w[15], w[14], selector); + w[22] = __byte_perm (w[14], w[13], selector); + w[21] = __byte_perm (w[13], w[12], selector); + w[20] = __byte_perm (w[12], w[11], selector); + w[19] = __byte_perm (w[11], w[10], selector); + w[18] = __byte_perm (w[10], w[ 9], selector); + w[17] = __byte_perm (w[ 9], w[ 8], selector); + w[16] = __byte_perm (w[ 8], w[ 7], selector); + w[15] = __byte_perm (w[ 7], w[ 6], selector); + w[14] = __byte_perm (w[ 6], w[ 5], selector); + w[13] = __byte_perm (w[ 5], w[ 4], selector); + w[12] = __byte_perm (w[ 4], w[ 3], selector); + w[11] = __byte_perm (w[ 3], w[ 
2], selector); + w[10] = __byte_perm (w[ 2], w[ 1], selector); + w[ 9] = __byte_perm (w[ 1], w[ 0], selector); + w[ 8] = __byte_perm (w[ 0], 0, selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm (w[54], w[53], selector); + w[62] = __byte_perm (w[53], w[52], selector); + w[61] = __byte_perm (w[52], w[51], selector); + w[60] = __byte_perm (w[51], w[50], selector); + w[59] = __byte_perm (w[50], w[49], selector); + w[58] = __byte_perm (w[49], w[48], selector); + w[57] = __byte_perm (w[48], w[47], selector); + w[56] = __byte_perm (w[47], w[46], selector); + w[55] = __byte_perm (w[46], w[45], selector); + w[54] = __byte_perm (w[45], w[44], selector); + w[53] = __byte_perm (w[44], w[43], selector); + w[52] = __byte_perm (w[43], w[42], selector); + w[51] = __byte_perm (w[42], w[41], selector); + w[50] = __byte_perm (w[41], w[40], selector); + w[49] = __byte_perm (w[40], w[39], selector); + w[48] = __byte_perm (w[39], w[38], selector); + w[47] = __byte_perm (w[38], w[37], selector); + w[46] = __byte_perm (w[37], w[36], selector); + w[45] = __byte_perm (w[36], w[35], selector); + w[44] = __byte_perm (w[35], w[34], selector); + w[43] = __byte_perm (w[34], w[33], selector); + w[42] = __byte_perm (w[33], w[32], selector); + w[41] = __byte_perm (w[32], w[31], selector); + w[40] = __byte_perm (w[31], w[30], selector); + w[39] = __byte_perm (w[30], w[29], selector); + w[38] = __byte_perm (w[29], w[28], selector); + w[37] = __byte_perm (w[28], w[27], selector); + w[36] = __byte_perm (w[27], w[26], selector); + w[35] = __byte_perm (w[26], w[25], selector); + w[34] = __byte_perm (w[25], w[24], selector); + w[33] = __byte_perm (w[24], w[23], selector); + w[32] = __byte_perm (w[23], w[22], selector); + w[31] = __byte_perm (w[22], w[21], selector); + w[30] = __byte_perm (w[21], w[20], selector); + w[29] = __byte_perm (w[20], w[19], selector); + w[28] = __byte_perm (w[19], w[18], 
selector); + w[27] = __byte_perm (w[18], w[17], selector); + w[26] = __byte_perm (w[17], w[16], selector); + w[25] = __byte_perm (w[16], w[15], selector); + w[24] = __byte_perm (w[15], w[14], selector); + w[23] = __byte_perm (w[14], w[13], selector); + w[22] = __byte_perm (w[13], w[12], selector); + w[21] = __byte_perm (w[12], w[11], selector); + w[20] = __byte_perm (w[11], w[10], selector); + w[19] = __byte_perm (w[10], w[ 9], selector); + w[18] = __byte_perm (w[ 9], w[ 8], selector); + w[17] = __byte_perm (w[ 8], w[ 7], selector); + w[16] = __byte_perm (w[ 7], w[ 6], selector); + w[15] = __byte_perm (w[ 6], w[ 5], selector); + w[14] = __byte_perm (w[ 5], w[ 4], selector); + w[13] = __byte_perm (w[ 4], w[ 3], selector); + w[12] = __byte_perm (w[ 3], w[ 2], selector); + w[11] = __byte_perm (w[ 2], w[ 1], selector); + w[10] = __byte_perm (w[ 1], w[ 0], selector); + w[ 9] = __byte_perm (w[ 0], 0, selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm (w[53], w[52], selector); + w[62] = __byte_perm (w[52], w[51], selector); + w[61] = __byte_perm (w[51], w[50], selector); + w[60] = __byte_perm (w[50], w[49], selector); + w[59] = __byte_perm (w[49], w[48], selector); + w[58] = __byte_perm (w[48], w[47], selector); + w[57] = __byte_perm (w[47], w[46], selector); + w[56] = __byte_perm (w[46], w[45], selector); + w[55] = __byte_perm (w[45], w[44], selector); + w[54] = __byte_perm (w[44], w[43], selector); + w[53] = __byte_perm (w[43], w[42], selector); + w[52] = __byte_perm (w[42], w[41], selector); + w[51] = __byte_perm (w[41], w[40], selector); + w[50] = __byte_perm (w[40], w[39], selector); + w[49] = __byte_perm (w[39], w[38], selector); + w[48] = __byte_perm (w[38], w[37], selector); + w[47] = __byte_perm (w[37], w[36], selector); + w[46] = __byte_perm (w[36], w[35], selector); + w[45] = __byte_perm (w[35], w[34], selector); + w[44] = __byte_perm 
(w[34], w[33], selector); + w[43] = __byte_perm (w[33], w[32], selector); + w[42] = __byte_perm (w[32], w[31], selector); + w[41] = __byte_perm (w[31], w[30], selector); + w[40] = __byte_perm (w[30], w[29], selector); + w[39] = __byte_perm (w[29], w[28], selector); + w[38] = __byte_perm (w[28], w[27], selector); + w[37] = __byte_perm (w[27], w[26], selector); + w[36] = __byte_perm (w[26], w[25], selector); + w[35] = __byte_perm (w[25], w[24], selector); + w[34] = __byte_perm (w[24], w[23], selector); + w[33] = __byte_perm (w[23], w[22], selector); + w[32] = __byte_perm (w[22], w[21], selector); + w[31] = __byte_perm (w[21], w[20], selector); + w[30] = __byte_perm (w[20], w[19], selector); + w[29] = __byte_perm (w[19], w[18], selector); + w[28] = __byte_perm (w[18], w[17], selector); + w[27] = __byte_perm (w[17], w[16], selector); + w[26] = __byte_perm (w[16], w[15], selector); + w[25] = __byte_perm (w[15], w[14], selector); + w[24] = __byte_perm (w[14], w[13], selector); + w[23] = __byte_perm (w[13], w[12], selector); + w[22] = __byte_perm (w[12], w[11], selector); + w[21] = __byte_perm (w[11], w[10], selector); + w[20] = __byte_perm (w[10], w[ 9], selector); + w[19] = __byte_perm (w[ 9], w[ 8], selector); + w[18] = __byte_perm (w[ 8], w[ 7], selector); + w[17] = __byte_perm (w[ 7], w[ 6], selector); + w[16] = __byte_perm (w[ 6], w[ 5], selector); + w[15] = __byte_perm (w[ 5], w[ 4], selector); + w[14] = __byte_perm (w[ 4], w[ 3], selector); + w[13] = __byte_perm (w[ 3], w[ 2], selector); + w[12] = __byte_perm (w[ 2], w[ 1], selector); + w[11] = __byte_perm (w[ 1], w[ 0], selector); + w[10] = __byte_perm (w[ 0], 0, selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm (w[52], w[51], selector); + w[62] = __byte_perm (w[51], w[50], selector); + w[61] = __byte_perm (w[50], w[49], selector); + w[60] = __byte_perm (w[49], w[48], selector); 
+ w[59] = __byte_perm (w[48], w[47], selector); + w[58] = __byte_perm (w[47], w[46], selector); + w[57] = __byte_perm (w[46], w[45], selector); + w[56] = __byte_perm (w[45], w[44], selector); + w[55] = __byte_perm (w[44], w[43], selector); + w[54] = __byte_perm (w[43], w[42], selector); + w[53] = __byte_perm (w[42], w[41], selector); + w[52] = __byte_perm (w[41], w[40], selector); + w[51] = __byte_perm (w[40], w[39], selector); + w[50] = __byte_perm (w[39], w[38], selector); + w[49] = __byte_perm (w[38], w[37], selector); + w[48] = __byte_perm (w[37], w[36], selector); + w[47] = __byte_perm (w[36], w[35], selector); + w[46] = __byte_perm (w[35], w[34], selector); + w[45] = __byte_perm (w[34], w[33], selector); + w[44] = __byte_perm (w[33], w[32], selector); + w[43] = __byte_perm (w[32], w[31], selector); + w[42] = __byte_perm (w[31], w[30], selector); + w[41] = __byte_perm (w[30], w[29], selector); + w[40] = __byte_perm (w[29], w[28], selector); + w[39] = __byte_perm (w[28], w[27], selector); + w[38] = __byte_perm (w[27], w[26], selector); + w[37] = __byte_perm (w[26], w[25], selector); + w[36] = __byte_perm (w[25], w[24], selector); + w[35] = __byte_perm (w[24], w[23], selector); + w[34] = __byte_perm (w[23], w[22], selector); + w[33] = __byte_perm (w[22], w[21], selector); + w[32] = __byte_perm (w[21], w[20], selector); + w[31] = __byte_perm (w[20], w[19], selector); + w[30] = __byte_perm (w[19], w[18], selector); + w[29] = __byte_perm (w[18], w[17], selector); + w[28] = __byte_perm (w[17], w[16], selector); + w[27] = __byte_perm (w[16], w[15], selector); + w[26] = __byte_perm (w[15], w[14], selector); + w[25] = __byte_perm (w[14], w[13], selector); + w[24] = __byte_perm (w[13], w[12], selector); + w[23] = __byte_perm (w[12], w[11], selector); + w[22] = __byte_perm (w[11], w[10], selector); + w[21] = __byte_perm (w[10], w[ 9], selector); + w[20] = __byte_perm (w[ 9], w[ 8], selector); + w[19] = __byte_perm (w[ 8], w[ 7], selector); + w[18] = __byte_perm (w[ 7], 
w[ 6], selector); + w[17] = __byte_perm (w[ 6], w[ 5], selector); + w[16] = __byte_perm (w[ 5], w[ 4], selector); + w[15] = __byte_perm (w[ 4], w[ 3], selector); + w[14] = __byte_perm (w[ 3], w[ 2], selector); + w[13] = __byte_perm (w[ 2], w[ 1], selector); + w[12] = __byte_perm (w[ 1], w[ 0], selector); + w[11] = __byte_perm (w[ 0], 0, selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm (w[51], w[50], selector); + w[62] = __byte_perm (w[50], w[49], selector); + w[61] = __byte_perm (w[49], w[48], selector); + w[60] = __byte_perm (w[48], w[47], selector); + w[59] = __byte_perm (w[47], w[46], selector); + w[58] = __byte_perm (w[46], w[45], selector); + w[57] = __byte_perm (w[45], w[44], selector); + w[56] = __byte_perm (w[44], w[43], selector); + w[55] = __byte_perm (w[43], w[42], selector); + w[54] = __byte_perm (w[42], w[41], selector); + w[53] = __byte_perm (w[41], w[40], selector); + w[52] = __byte_perm (w[40], w[39], selector); + w[51] = __byte_perm (w[39], w[38], selector); + w[50] = __byte_perm (w[38], w[37], selector); + w[49] = __byte_perm (w[37], w[36], selector); + w[48] = __byte_perm (w[36], w[35], selector); + w[47] = __byte_perm (w[35], w[34], selector); + w[46] = __byte_perm (w[34], w[33], selector); + w[45] = __byte_perm (w[33], w[32], selector); + w[44] = __byte_perm (w[32], w[31], selector); + w[43] = __byte_perm (w[31], w[30], selector); + w[42] = __byte_perm (w[30], w[29], selector); + w[41] = __byte_perm (w[29], w[28], selector); + w[40] = __byte_perm (w[28], w[27], selector); + w[39] = __byte_perm (w[27], w[26], selector); + w[38] = __byte_perm (w[26], w[25], selector); + w[37] = __byte_perm (w[25], w[24], selector); + w[36] = __byte_perm (w[24], w[23], selector); + w[35] = __byte_perm (w[23], w[22], selector); + w[34] = __byte_perm (w[22], w[21], selector); + w[33] = __byte_perm (w[21], w[20], 
selector); + w[32] = __byte_perm (w[20], w[19], selector); + w[31] = __byte_perm (w[19], w[18], selector); + w[30] = __byte_perm (w[18], w[17], selector); + w[29] = __byte_perm (w[17], w[16], selector); + w[28] = __byte_perm (w[16], w[15], selector); + w[27] = __byte_perm (w[15], w[14], selector); + w[26] = __byte_perm (w[14], w[13], selector); + w[25] = __byte_perm (w[13], w[12], selector); + w[24] = __byte_perm (w[12], w[11], selector); + w[23] = __byte_perm (w[11], w[10], selector); + w[22] = __byte_perm (w[10], w[ 9], selector); + w[21] = __byte_perm (w[ 9], w[ 8], selector); + w[20] = __byte_perm (w[ 8], w[ 7], selector); + w[19] = __byte_perm (w[ 7], w[ 6], selector); + w[18] = __byte_perm (w[ 6], w[ 5], selector); + w[17] = __byte_perm (w[ 5], w[ 4], selector); + w[16] = __byte_perm (w[ 4], w[ 3], selector); + w[15] = __byte_perm (w[ 3], w[ 2], selector); + w[14] = __byte_perm (w[ 2], w[ 1], selector); + w[13] = __byte_perm (w[ 1], w[ 0], selector); + w[12] = __byte_perm (w[ 0], 0, selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = __byte_perm (w[50], w[49], selector); + w[62] = __byte_perm (w[49], w[48], selector); + w[61] = __byte_perm (w[48], w[47], selector); + w[60] = __byte_perm (w[47], w[46], selector); + w[59] = __byte_perm (w[46], w[45], selector); + w[58] = __byte_perm (w[45], w[44], selector); + w[57] = __byte_perm (w[44], w[43], selector); + w[56] = __byte_perm (w[43], w[42], selector); + w[55] = __byte_perm (w[42], w[41], selector); + w[54] = __byte_perm (w[41], w[40], selector); + w[53] = __byte_perm (w[40], w[39], selector); + w[52] = __byte_perm (w[39], w[38], selector); + w[51] = __byte_perm (w[38], w[37], selector); + w[50] = __byte_perm (w[37], w[36], selector); + w[49] = __byte_perm (w[36], w[35], selector); + w[48] = __byte_perm (w[35], w[34], selector); + w[47] = __byte_perm (w[34], w[33], 
selector); + w[46] = __byte_perm (w[33], w[32], selector); + w[45] = __byte_perm (w[32], w[31], selector); + w[44] = __byte_perm (w[31], w[30], selector); + w[43] = __byte_perm (w[30], w[29], selector); + w[42] = __byte_perm (w[29], w[28], selector); + w[41] = __byte_perm (w[28], w[27], selector); + w[40] = __byte_perm (w[27], w[26], selector); + w[39] = __byte_perm (w[26], w[25], selector); + w[38] = __byte_perm (w[25], w[24], selector); + w[37] = __byte_perm (w[24], w[23], selector); + w[36] = __byte_perm (w[23], w[22], selector); + w[35] = __byte_perm (w[22], w[21], selector); + w[34] = __byte_perm (w[21], w[20], selector); + w[33] = __byte_perm (w[20], w[19], selector); + w[32] = __byte_perm (w[19], w[18], selector); + w[31] = __byte_perm (w[18], w[17], selector); + w[30] = __byte_perm (w[17], w[16], selector); + w[29] = __byte_perm (w[16], w[15], selector); + w[28] = __byte_perm (w[15], w[14], selector); + w[27] = __byte_perm (w[14], w[13], selector); + w[26] = __byte_perm (w[13], w[12], selector); + w[25] = __byte_perm (w[12], w[11], selector); + w[24] = __byte_perm (w[11], w[10], selector); + w[23] = __byte_perm (w[10], w[ 9], selector); + w[22] = __byte_perm (w[ 9], w[ 8], selector); + w[21] = __byte_perm (w[ 8], w[ 7], selector); + w[20] = __byte_perm (w[ 7], w[ 6], selector); + w[19] = __byte_perm (w[ 6], w[ 5], selector); + w[18] = __byte_perm (w[ 5], w[ 4], selector); + w[17] = __byte_perm (w[ 4], w[ 3], selector); + w[16] = __byte_perm (w[ 3], w[ 2], selector); + w[15] = __byte_perm (w[ 2], w[ 1], selector); + w[14] = __byte_perm (w[ 1], w[ 0], selector); + w[13] = __byte_perm (w[ 0], 0, selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm (w[49], w[48], selector); + w[62] = __byte_perm (w[48], w[47], selector); + w[61] = __byte_perm (w[47], w[46], selector); + w[60] = __byte_perm 
(w[46], w[45], selector); + w[59] = __byte_perm (w[45], w[44], selector); + w[58] = __byte_perm (w[44], w[43], selector); + w[57] = __byte_perm (w[43], w[42], selector); + w[56] = __byte_perm (w[42], w[41], selector); + w[55] = __byte_perm (w[41], w[40], selector); + w[54] = __byte_perm (w[40], w[39], selector); + w[53] = __byte_perm (w[39], w[38], selector); + w[52] = __byte_perm (w[38], w[37], selector); + w[51] = __byte_perm (w[37], w[36], selector); + w[50] = __byte_perm (w[36], w[35], selector); + w[49] = __byte_perm (w[35], w[34], selector); + w[48] = __byte_perm (w[34], w[33], selector); + w[47] = __byte_perm (w[33], w[32], selector); + w[46] = __byte_perm (w[32], w[31], selector); + w[45] = __byte_perm (w[31], w[30], selector); + w[44] = __byte_perm (w[30], w[29], selector); + w[43] = __byte_perm (w[29], w[28], selector); + w[42] = __byte_perm (w[28], w[27], selector); + w[41] = __byte_perm (w[27], w[26], selector); + w[40] = __byte_perm (w[26], w[25], selector); + w[39] = __byte_perm (w[25], w[24], selector); + w[38] = __byte_perm (w[24], w[23], selector); + w[37] = __byte_perm (w[23], w[22], selector); + w[36] = __byte_perm (w[22], w[21], selector); + w[35] = __byte_perm (w[21], w[20], selector); + w[34] = __byte_perm (w[20], w[19], selector); + w[33] = __byte_perm (w[19], w[18], selector); + w[32] = __byte_perm (w[18], w[17], selector); + w[31] = __byte_perm (w[17], w[16], selector); + w[30] = __byte_perm (w[16], w[15], selector); + w[29] = __byte_perm (w[15], w[14], selector); + w[28] = __byte_perm (w[14], w[13], selector); + w[27] = __byte_perm (w[13], w[12], selector); + w[26] = __byte_perm (w[12], w[11], selector); + w[25] = __byte_perm (w[11], w[10], selector); + w[24] = __byte_perm (w[10], w[ 9], selector); + w[23] = __byte_perm (w[ 9], w[ 8], selector); + w[22] = __byte_perm (w[ 8], w[ 7], selector); + w[21] = __byte_perm (w[ 7], w[ 6], selector); + w[20] = __byte_perm (w[ 6], w[ 5], selector); + w[19] = __byte_perm (w[ 5], w[ 4], selector); + 
w[18] = __byte_perm (w[ 4], w[ 3], selector); + w[17] = __byte_perm (w[ 3], w[ 2], selector); + w[16] = __byte_perm (w[ 2], w[ 1], selector); + w[15] = __byte_perm (w[ 1], w[ 0], selector); + w[14] = __byte_perm (w[ 0], 0, selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm (w[48], w[47], selector); + w[62] = __byte_perm (w[47], w[46], selector); + w[61] = __byte_perm (w[46], w[45], selector); + w[60] = __byte_perm (w[45], w[44], selector); + w[59] = __byte_perm (w[44], w[43], selector); + w[58] = __byte_perm (w[43], w[42], selector); + w[57] = __byte_perm (w[42], w[41], selector); + w[56] = __byte_perm (w[41], w[40], selector); + w[55] = __byte_perm (w[40], w[39], selector); + w[54] = __byte_perm (w[39], w[38], selector); + w[53] = __byte_perm (w[38], w[37], selector); + w[52] = __byte_perm (w[37], w[36], selector); + w[51] = __byte_perm (w[36], w[35], selector); + w[50] = __byte_perm (w[35], w[34], selector); + w[49] = __byte_perm (w[34], w[33], selector); + w[48] = __byte_perm (w[33], w[32], selector); + w[47] = __byte_perm (w[32], w[31], selector); + w[46] = __byte_perm (w[31], w[30], selector); + w[45] = __byte_perm (w[30], w[29], selector); + w[44] = __byte_perm (w[29], w[28], selector); + w[43] = __byte_perm (w[28], w[27], selector); + w[42] = __byte_perm (w[27], w[26], selector); + w[41] = __byte_perm (w[26], w[25], selector); + w[40] = __byte_perm (w[25], w[24], selector); + w[39] = __byte_perm (w[24], w[23], selector); + w[38] = __byte_perm (w[23], w[22], selector); + w[37] = __byte_perm (w[22], w[21], selector); + w[36] = __byte_perm (w[21], w[20], selector); + w[35] = __byte_perm (w[20], w[19], selector); + w[34] = __byte_perm (w[19], w[18], selector); + w[33] = __byte_perm (w[18], w[17], selector); + w[32] = __byte_perm (w[17], w[16], selector); + w[31] = __byte_perm 
(w[16], w[15], selector); + w[30] = __byte_perm (w[15], w[14], selector); + w[29] = __byte_perm (w[14], w[13], selector); + w[28] = __byte_perm (w[13], w[12], selector); + w[27] = __byte_perm (w[12], w[11], selector); + w[26] = __byte_perm (w[11], w[10], selector); + w[25] = __byte_perm (w[10], w[ 9], selector); + w[24] = __byte_perm (w[ 9], w[ 8], selector); + w[23] = __byte_perm (w[ 8], w[ 7], selector); + w[22] = __byte_perm (w[ 7], w[ 6], selector); + w[21] = __byte_perm (w[ 6], w[ 5], selector); + w[20] = __byte_perm (w[ 5], w[ 4], selector); + w[19] = __byte_perm (w[ 4], w[ 3], selector); + w[18] = __byte_perm (w[ 3], w[ 2], selector); + w[17] = __byte_perm (w[ 2], w[ 1], selector); + w[16] = __byte_perm (w[ 1], w[ 0], selector); + w[15] = __byte_perm (w[ 0], 0, selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm (w[47], w[46], selector); + w[62] = __byte_perm (w[46], w[45], selector); + w[61] = __byte_perm (w[45], w[44], selector); + w[60] = __byte_perm (w[44], w[43], selector); + w[59] = __byte_perm (w[43], w[42], selector); + w[58] = __byte_perm (w[42], w[41], selector); + w[57] = __byte_perm (w[41], w[40], selector); + w[56] = __byte_perm (w[40], w[39], selector); + w[55] = __byte_perm (w[39], w[38], selector); + w[54] = __byte_perm (w[38], w[37], selector); + w[53] = __byte_perm (w[37], w[36], selector); + w[52] = __byte_perm (w[36], w[35], selector); + w[51] = __byte_perm (w[35], w[34], selector); + w[50] = __byte_perm (w[34], w[33], selector); + w[49] = __byte_perm (w[33], w[32], selector); + w[48] = __byte_perm (w[32], w[31], selector); + w[47] = __byte_perm (w[31], w[30], selector); + w[46] = __byte_perm (w[30], w[29], selector); + w[45] = __byte_perm (w[29], w[28], selector); + w[44] = __byte_perm (w[28], w[27], selector); + w[43] = __byte_perm (w[27], 
w[26], selector); + w[42] = __byte_perm (w[26], w[25], selector); + w[41] = __byte_perm (w[25], w[24], selector); + w[40] = __byte_perm (w[24], w[23], selector); + w[39] = __byte_perm (w[23], w[22], selector); + w[38] = __byte_perm (w[22], w[21], selector); + w[37] = __byte_perm (w[21], w[20], selector); + w[36] = __byte_perm (w[20], w[19], selector); + w[35] = __byte_perm (w[19], w[18], selector); + w[34] = __byte_perm (w[18], w[17], selector); + w[33] = __byte_perm (w[17], w[16], selector); + w[32] = __byte_perm (w[16], w[15], selector); + w[31] = __byte_perm (w[15], w[14], selector); + w[30] = __byte_perm (w[14], w[13], selector); + w[29] = __byte_perm (w[13], w[12], selector); + w[28] = __byte_perm (w[12], w[11], selector); + w[27] = __byte_perm (w[11], w[10], selector); + w[26] = __byte_perm (w[10], w[ 9], selector); + w[25] = __byte_perm (w[ 9], w[ 8], selector); + w[24] = __byte_perm (w[ 8], w[ 7], selector); + w[23] = __byte_perm (w[ 7], w[ 6], selector); + w[22] = __byte_perm (w[ 6], w[ 5], selector); + w[21] = __byte_perm (w[ 5], w[ 4], selector); + w[20] = __byte_perm (w[ 4], w[ 3], selector); + w[19] = __byte_perm (w[ 3], w[ 2], selector); + w[18] = __byte_perm (w[ 2], w[ 1], selector); + w[17] = __byte_perm (w[ 1], w[ 0], selector); + w[16] = __byte_perm (w[ 0], 0, selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm (w[46], w[45], selector); + w[62] = __byte_perm (w[45], w[44], selector); + w[61] = __byte_perm (w[44], w[43], selector); + w[60] = __byte_perm (w[43], w[42], selector); + w[59] = __byte_perm (w[42], w[41], selector); + w[58] = __byte_perm (w[41], w[40], selector); + w[57] = __byte_perm (w[40], w[39], selector); + w[56] = __byte_perm (w[39], w[38], selector); + w[55] = __byte_perm (w[38], w[37], selector); + w[54] = __byte_perm 
(w[37], w[36], selector); + w[53] = __byte_perm (w[36], w[35], selector); + w[52] = __byte_perm (w[35], w[34], selector); + w[51] = __byte_perm (w[34], w[33], selector); + w[50] = __byte_perm (w[33], w[32], selector); + w[49] = __byte_perm (w[32], w[31], selector); + w[48] = __byte_perm (w[31], w[30], selector); + w[47] = __byte_perm (w[30], w[29], selector); + w[46] = __byte_perm (w[29], w[28], selector); + w[45] = __byte_perm (w[28], w[27], selector); + w[44] = __byte_perm (w[27], w[26], selector); + w[43] = __byte_perm (w[26], w[25], selector); + w[42] = __byte_perm (w[25], w[24], selector); + w[41] = __byte_perm (w[24], w[23], selector); + w[40] = __byte_perm (w[23], w[22], selector); + w[39] = __byte_perm (w[22], w[21], selector); + w[38] = __byte_perm (w[21], w[20], selector); + w[37] = __byte_perm (w[20], w[19], selector); + w[36] = __byte_perm (w[19], w[18], selector); + w[35] = __byte_perm (w[18], w[17], selector); + w[34] = __byte_perm (w[17], w[16], selector); + w[33] = __byte_perm (w[16], w[15], selector); + w[32] = __byte_perm (w[15], w[14], selector); + w[31] = __byte_perm (w[14], w[13], selector); + w[30] = __byte_perm (w[13], w[12], selector); + w[29] = __byte_perm (w[12], w[11], selector); + w[28] = __byte_perm (w[11], w[10], selector); + w[27] = __byte_perm (w[10], w[ 9], selector); + w[26] = __byte_perm (w[ 9], w[ 8], selector); + w[25] = __byte_perm (w[ 8], w[ 7], selector); + w[24] = __byte_perm (w[ 7], w[ 6], selector); + w[23] = __byte_perm (w[ 6], w[ 5], selector); + w[22] = __byte_perm (w[ 5], w[ 4], selector); + w[21] = __byte_perm (w[ 4], w[ 3], selector); + w[20] = __byte_perm (w[ 3], w[ 2], selector); + w[19] = __byte_perm (w[ 2], w[ 1], selector); + w[18] = __byte_perm (w[ 1], w[ 0], selector); + w[17] = __byte_perm (w[ 0], 0, selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 
1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm (w[45], w[44], selector); + w[62] = __byte_perm (w[44], w[43], selector); + w[61] = __byte_perm (w[43], w[42], selector); + w[60] = __byte_perm (w[42], w[41], selector); + w[59] = __byte_perm (w[41], w[40], selector); + w[58] = __byte_perm (w[40], w[39], selector); + w[57] = __byte_perm (w[39], w[38], selector); + w[56] = __byte_perm (w[38], w[37], selector); + w[55] = __byte_perm (w[37], w[36], selector); + w[54] = __byte_perm (w[36], w[35], selector); + w[53] = __byte_perm (w[35], w[34], selector); + w[52] = __byte_perm (w[34], w[33], selector); + w[51] = __byte_perm (w[33], w[32], selector); + w[50] = __byte_perm (w[32], w[31], selector); + w[49] = __byte_perm (w[31], w[30], selector); + w[48] = __byte_perm (w[30], w[29], selector); + w[47] = __byte_perm (w[29], w[28], selector); + w[46] = __byte_perm (w[28], w[27], selector); + w[45] = __byte_perm (w[27], w[26], selector); + w[44] = __byte_perm (w[26], w[25], selector); + w[43] = __byte_perm (w[25], w[24], selector); + w[42] = __byte_perm (w[24], w[23], selector); + w[41] = __byte_perm (w[23], w[22], selector); + w[40] = __byte_perm (w[22], w[21], selector); + w[39] = __byte_perm (w[21], w[20], selector); + w[38] = __byte_perm (w[20], w[19], selector); + w[37] = __byte_perm (w[19], w[18], selector); + w[36] = __byte_perm (w[18], w[17], selector); + w[35] = __byte_perm (w[17], w[16], selector); + w[34] = __byte_perm (w[16], w[15], selector); + w[33] = __byte_perm (w[15], w[14], selector); + w[32] = __byte_perm (w[14], w[13], selector); + w[31] = __byte_perm (w[13], w[12], selector); + w[30] = __byte_perm (w[12], w[11], selector); + w[29] = __byte_perm (w[11], w[10], selector); + w[28] = __byte_perm (w[10], w[ 9], selector); + w[27] = __byte_perm (w[ 9], w[ 8], selector); + w[26] = __byte_perm (w[ 8], w[ 7], selector); + w[25] = __byte_perm (w[ 7], w[ 6], selector); + w[24] = __byte_perm (w[ 6], w[ 5], selector); + w[23] = __byte_perm (w[ 5], w[ 
4], selector); + w[22] = __byte_perm (w[ 4], w[ 3], selector); + w[21] = __byte_perm (w[ 3], w[ 2], selector); + w[20] = __byte_perm (w[ 2], w[ 1], selector); + w[19] = __byte_perm (w[ 1], w[ 0], selector); + w[18] = __byte_perm (w[ 0], 0, selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm (w[44], w[43], selector); + w[62] = __byte_perm (w[43], w[42], selector); + w[61] = __byte_perm (w[42], w[41], selector); + w[60] = __byte_perm (w[41], w[40], selector); + w[59] = __byte_perm (w[40], w[39], selector); + w[58] = __byte_perm (w[39], w[38], selector); + w[57] = __byte_perm (w[38], w[37], selector); + w[56] = __byte_perm (w[37], w[36], selector); + w[55] = __byte_perm (w[36], w[35], selector); + w[54] = __byte_perm (w[35], w[34], selector); + w[53] = __byte_perm (w[34], w[33], selector); + w[52] = __byte_perm (w[33], w[32], selector); + w[51] = __byte_perm (w[32], w[31], selector); + w[50] = __byte_perm (w[31], w[30], selector); + w[49] = __byte_perm (w[30], w[29], selector); + w[48] = __byte_perm (w[29], w[28], selector); + w[47] = __byte_perm (w[28], w[27], selector); + w[46] = __byte_perm (w[27], w[26], selector); + w[45] = __byte_perm (w[26], w[25], selector); + w[44] = __byte_perm (w[25], w[24], selector); + w[43] = __byte_perm (w[24], w[23], selector); + w[42] = __byte_perm (w[23], w[22], selector); + w[41] = __byte_perm (w[22], w[21], selector); + w[40] = __byte_perm (w[21], w[20], selector); + w[39] = __byte_perm (w[20], w[19], selector); + w[38] = __byte_perm (w[19], w[18], selector); + w[37] = __byte_perm (w[18], w[17], selector); + w[36] = __byte_perm (w[17], w[16], selector); + w[35] = __byte_perm (w[16], w[15], selector); + w[34] = __byte_perm (w[15], w[14], selector); + w[33] = __byte_perm (w[14], w[13], selector); + 
w[32] = __byte_perm (w[13], w[12], selector); + w[31] = __byte_perm (w[12], w[11], selector); + w[30] = __byte_perm (w[11], w[10], selector); + w[29] = __byte_perm (w[10], w[ 9], selector); + w[28] = __byte_perm (w[ 9], w[ 8], selector); + w[27] = __byte_perm (w[ 8], w[ 7], selector); + w[26] = __byte_perm (w[ 7], w[ 6], selector); + w[25] = __byte_perm (w[ 6], w[ 5], selector); + w[24] = __byte_perm (w[ 5], w[ 4], selector); + w[23] = __byte_perm (w[ 4], w[ 3], selector); + w[22] = __byte_perm (w[ 3], w[ 2], selector); + w[21] = __byte_perm (w[ 2], w[ 1], selector); + w[20] = __byte_perm (w[ 1], w[ 0], selector); + w[19] = __byte_perm (w[ 0], 0, selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm (w[43], w[42], selector); + w[62] = __byte_perm (w[42], w[41], selector); + w[61] = __byte_perm (w[41], w[40], selector); + w[60] = __byte_perm (w[40], w[39], selector); + w[59] = __byte_perm (w[39], w[38], selector); + w[58] = __byte_perm (w[38], w[37], selector); + w[57] = __byte_perm (w[37], w[36], selector); + w[56] = __byte_perm (w[36], w[35], selector); + w[55] = __byte_perm (w[35], w[34], selector); + w[54] = __byte_perm (w[34], w[33], selector); + w[53] = __byte_perm (w[33], w[32], selector); + w[52] = __byte_perm (w[32], w[31], selector); + w[51] = __byte_perm (w[31], w[30], selector); + w[50] = __byte_perm (w[30], w[29], selector); + w[49] = __byte_perm (w[29], w[28], selector); + w[48] = __byte_perm (w[28], w[27], selector); + w[47] = __byte_perm (w[27], w[26], selector); + w[46] = __byte_perm (w[26], w[25], selector); + w[45] = __byte_perm (w[25], w[24], selector); + w[44] = __byte_perm (w[24], w[23], selector); + w[43] = __byte_perm (w[23], w[22], selector); + w[42] = __byte_perm (w[22], w[21], selector); + 
w[41] = __byte_perm (w[21], w[20], selector); + w[40] = __byte_perm (w[20], w[19], selector); + w[39] = __byte_perm (w[19], w[18], selector); + w[38] = __byte_perm (w[18], w[17], selector); + w[37] = __byte_perm (w[17], w[16], selector); + w[36] = __byte_perm (w[16], w[15], selector); + w[35] = __byte_perm (w[15], w[14], selector); + w[34] = __byte_perm (w[14], w[13], selector); + w[33] = __byte_perm (w[13], w[12], selector); + w[32] = __byte_perm (w[12], w[11], selector); + w[31] = __byte_perm (w[11], w[10], selector); + w[30] = __byte_perm (w[10], w[ 9], selector); + w[29] = __byte_perm (w[ 9], w[ 8], selector); + w[28] = __byte_perm (w[ 8], w[ 7], selector); + w[27] = __byte_perm (w[ 7], w[ 6], selector); + w[26] = __byte_perm (w[ 6], w[ 5], selector); + w[25] = __byte_perm (w[ 5], w[ 4], selector); + w[24] = __byte_perm (w[ 4], w[ 3], selector); + w[23] = __byte_perm (w[ 3], w[ 2], selector); + w[22] = __byte_perm (w[ 2], w[ 1], selector); + w[21] = __byte_perm (w[ 1], w[ 0], selector); + w[20] = __byte_perm (w[ 0], 0, selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm (w[42], w[41], selector); + w[62] = __byte_perm (w[41], w[40], selector); + w[61] = __byte_perm (w[40], w[39], selector); + w[60] = __byte_perm (w[39], w[38], selector); + w[59] = __byte_perm (w[38], w[37], selector); + w[58] = __byte_perm (w[37], w[36], selector); + w[57] = __byte_perm (w[36], w[35], selector); + w[56] = __byte_perm (w[35], w[34], selector); + w[55] = __byte_perm (w[34], w[33], selector); + w[54] = __byte_perm (w[33], w[32], selector); + w[53] = __byte_perm (w[32], w[31], selector); + w[52] = __byte_perm (w[31], w[30], selector); + w[51] = __byte_perm (w[30], w[29], selector); + w[50] = __byte_perm (w[29], w[28], 
selector); + w[49] = __byte_perm (w[28], w[27], selector); + w[48] = __byte_perm (w[27], w[26], selector); + w[47] = __byte_perm (w[26], w[25], selector); + w[46] = __byte_perm (w[25], w[24], selector); + w[45] = __byte_perm (w[24], w[23], selector); + w[44] = __byte_perm (w[23], w[22], selector); + w[43] = __byte_perm (w[22], w[21], selector); + w[42] = __byte_perm (w[21], w[20], selector); + w[41] = __byte_perm (w[20], w[19], selector); + w[40] = __byte_perm (w[19], w[18], selector); + w[39] = __byte_perm (w[18], w[17], selector); + w[38] = __byte_perm (w[17], w[16], selector); + w[37] = __byte_perm (w[16], w[15], selector); + w[36] = __byte_perm (w[15], w[14], selector); + w[35] = __byte_perm (w[14], w[13], selector); + w[34] = __byte_perm (w[13], w[12], selector); + w[33] = __byte_perm (w[12], w[11], selector); + w[32] = __byte_perm (w[11], w[10], selector); + w[31] = __byte_perm (w[10], w[ 9], selector); + w[30] = __byte_perm (w[ 9], w[ 8], selector); + w[29] = __byte_perm (w[ 8], w[ 7], selector); + w[28] = __byte_perm (w[ 7], w[ 6], selector); + w[27] = __byte_perm (w[ 6], w[ 5], selector); + w[26] = __byte_perm (w[ 5], w[ 4], selector); + w[25] = __byte_perm (w[ 4], w[ 3], selector); + w[24] = __byte_perm (w[ 3], w[ 2], selector); + w[23] = __byte_perm (w[ 2], w[ 1], selector); + w[22] = __byte_perm (w[ 1], w[ 0], selector); + w[21] = __byte_perm (w[ 0], 0, selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm (w[41], w[40], selector); + w[62] = __byte_perm (w[40], w[39], selector); + w[61] = __byte_perm (w[39], w[38], selector); + w[60] = __byte_perm (w[38], w[37], selector); + w[59] = __byte_perm (w[37], w[36], selector); + w[58] = __byte_perm (w[36], w[35], selector); + w[57] = 
__byte_perm (w[35], w[34], selector); + w[56] = __byte_perm (w[34], w[33], selector); + w[55] = __byte_perm (w[33], w[32], selector); + w[54] = __byte_perm (w[32], w[31], selector); + w[53] = __byte_perm (w[31], w[30], selector); + w[52] = __byte_perm (w[30], w[29], selector); + w[51] = __byte_perm (w[29], w[28], selector); + w[50] = __byte_perm (w[28], w[27], selector); + w[49] = __byte_perm (w[27], w[26], selector); + w[48] = __byte_perm (w[26], w[25], selector); + w[47] = __byte_perm (w[25], w[24], selector); + w[46] = __byte_perm (w[24], w[23], selector); + w[45] = __byte_perm (w[23], w[22], selector); + w[44] = __byte_perm (w[22], w[21], selector); + w[43] = __byte_perm (w[21], w[20], selector); + w[42] = __byte_perm (w[20], w[19], selector); + w[41] = __byte_perm (w[19], w[18], selector); + w[40] = __byte_perm (w[18], w[17], selector); + w[39] = __byte_perm (w[17], w[16], selector); + w[38] = __byte_perm (w[16], w[15], selector); + w[37] = __byte_perm (w[15], w[14], selector); + w[36] = __byte_perm (w[14], w[13], selector); + w[35] = __byte_perm (w[13], w[12], selector); + w[34] = __byte_perm (w[12], w[11], selector); + w[33] = __byte_perm (w[11], w[10], selector); + w[32] = __byte_perm (w[10], w[ 9], selector); + w[31] = __byte_perm (w[ 9], w[ 8], selector); + w[30] = __byte_perm (w[ 8], w[ 7], selector); + w[29] = __byte_perm (w[ 7], w[ 6], selector); + w[28] = __byte_perm (w[ 6], w[ 5], selector); + w[27] = __byte_perm (w[ 5], w[ 4], selector); + w[26] = __byte_perm (w[ 4], w[ 3], selector); + w[25] = __byte_perm (w[ 3], w[ 2], selector); + w[24] = __byte_perm (w[ 2], w[ 1], selector); + w[23] = __byte_perm (w[ 1], w[ 0], selector); + w[22] = __byte_perm (w[ 0], 0, selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; 
+ + break; + + case 23: + w[63] = __byte_perm (w[40], w[39], selector); + w[62] = __byte_perm (w[39], w[38], selector); + w[61] = __byte_perm (w[38], w[37], selector); + w[60] = __byte_perm (w[37], w[36], selector); + w[59] = __byte_perm (w[36], w[35], selector); + w[58] = __byte_perm (w[35], w[34], selector); + w[57] = __byte_perm (w[34], w[33], selector); + w[56] = __byte_perm (w[33], w[32], selector); + w[55] = __byte_perm (w[32], w[31], selector); + w[54] = __byte_perm (w[31], w[30], selector); + w[53] = __byte_perm (w[30], w[29], selector); + w[52] = __byte_perm (w[29], w[28], selector); + w[51] = __byte_perm (w[28], w[27], selector); + w[50] = __byte_perm (w[27], w[26], selector); + w[49] = __byte_perm (w[26], w[25], selector); + w[48] = __byte_perm (w[25], w[24], selector); + w[47] = __byte_perm (w[24], w[23], selector); + w[46] = __byte_perm (w[23], w[22], selector); + w[45] = __byte_perm (w[22], w[21], selector); + w[44] = __byte_perm (w[21], w[20], selector); + w[43] = __byte_perm (w[20], w[19], selector); + w[42] = __byte_perm (w[19], w[18], selector); + w[41] = __byte_perm (w[18], w[17], selector); + w[40] = __byte_perm (w[17], w[16], selector); + w[39] = __byte_perm (w[16], w[15], selector); + w[38] = __byte_perm (w[15], w[14], selector); + w[37] = __byte_perm (w[14], w[13], selector); + w[36] = __byte_perm (w[13], w[12], selector); + w[35] = __byte_perm (w[12], w[11], selector); + w[34] = __byte_perm (w[11], w[10], selector); + w[33] = __byte_perm (w[10], w[ 9], selector); + w[32] = __byte_perm (w[ 9], w[ 8], selector); + w[31] = __byte_perm (w[ 8], w[ 7], selector); + w[30] = __byte_perm (w[ 7], w[ 6], selector); + w[29] = __byte_perm (w[ 6], w[ 5], selector); + w[28] = __byte_perm (w[ 5], w[ 4], selector); + w[27] = __byte_perm (w[ 4], w[ 3], selector); + w[26] = __byte_perm (w[ 3], w[ 2], selector); + w[25] = __byte_perm (w[ 2], w[ 1], selector); + w[24] = __byte_perm (w[ 1], w[ 0], selector); + w[23] = __byte_perm (w[ 0], 0, selector); + w[22] = 
0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm (w[39], w[38], selector); + w[62] = __byte_perm (w[38], w[37], selector); + w[61] = __byte_perm (w[37], w[36], selector); + w[60] = __byte_perm (w[36], w[35], selector); + w[59] = __byte_perm (w[35], w[34], selector); + w[58] = __byte_perm (w[34], w[33], selector); + w[57] = __byte_perm (w[33], w[32], selector); + w[56] = __byte_perm (w[32], w[31], selector); + w[55] = __byte_perm (w[31], w[30], selector); + w[54] = __byte_perm (w[30], w[29], selector); + w[53] = __byte_perm (w[29], w[28], selector); + w[52] = __byte_perm (w[28], w[27], selector); + w[51] = __byte_perm (w[27], w[26], selector); + w[50] = __byte_perm (w[26], w[25], selector); + w[49] = __byte_perm (w[25], w[24], selector); + w[48] = __byte_perm (w[24], w[23], selector); + w[47] = __byte_perm (w[23], w[22], selector); + w[46] = __byte_perm (w[22], w[21], selector); + w[45] = __byte_perm (w[21], w[20], selector); + w[44] = __byte_perm (w[20], w[19], selector); + w[43] = __byte_perm (w[19], w[18], selector); + w[42] = __byte_perm (w[18], w[17], selector); + w[41] = __byte_perm (w[17], w[16], selector); + w[40] = __byte_perm (w[16], w[15], selector); + w[39] = __byte_perm (w[15], w[14], selector); + w[38] = __byte_perm (w[14], w[13], selector); + w[37] = __byte_perm (w[13], w[12], selector); + w[36] = __byte_perm (w[12], w[11], selector); + w[35] = __byte_perm (w[11], w[10], selector); + w[34] = __byte_perm (w[10], w[ 9], selector); + w[33] = __byte_perm (w[ 9], w[ 8], selector); + w[32] = __byte_perm (w[ 8], w[ 7], selector); + w[31] = __byte_perm (w[ 7], w[ 6], selector); + w[30] = __byte_perm (w[ 6], w[ 5], selector); + w[29] = __byte_perm (w[ 5], w[ 4], selector); + 
w[28] = __byte_perm (w[ 4], w[ 3], selector); + w[27] = __byte_perm (w[ 3], w[ 2], selector); + w[26] = __byte_perm (w[ 2], w[ 1], selector); + w[25] = __byte_perm (w[ 1], w[ 0], selector); + w[24] = __byte_perm (w[ 0], 0, selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm (w[38], w[37], selector); + w[62] = __byte_perm (w[37], w[36], selector); + w[61] = __byte_perm (w[36], w[35], selector); + w[60] = __byte_perm (w[35], w[34], selector); + w[59] = __byte_perm (w[34], w[33], selector); + w[58] = __byte_perm (w[33], w[32], selector); + w[57] = __byte_perm (w[32], w[31], selector); + w[56] = __byte_perm (w[31], w[30], selector); + w[55] = __byte_perm (w[30], w[29], selector); + w[54] = __byte_perm (w[29], w[28], selector); + w[53] = __byte_perm (w[28], w[27], selector); + w[52] = __byte_perm (w[27], w[26], selector); + w[51] = __byte_perm (w[26], w[25], selector); + w[50] = __byte_perm (w[25], w[24], selector); + w[49] = __byte_perm (w[24], w[23], selector); + w[48] = __byte_perm (w[23], w[22], selector); + w[47] = __byte_perm (w[22], w[21], selector); + w[46] = __byte_perm (w[21], w[20], selector); + w[45] = __byte_perm (w[20], w[19], selector); + w[44] = __byte_perm (w[19], w[18], selector); + w[43] = __byte_perm (w[18], w[17], selector); + w[42] = __byte_perm (w[17], w[16], selector); + w[41] = __byte_perm (w[16], w[15], selector); + w[40] = __byte_perm (w[15], w[14], selector); + w[39] = __byte_perm (w[14], w[13], selector); + w[38] = __byte_perm (w[13], w[12], selector); + w[37] = __byte_perm (w[12], w[11], selector); + w[36] = __byte_perm (w[11], w[10], selector); + w[35] = __byte_perm (w[10], w[ 9], selector); + w[34] = __byte_perm (w[ 9], w[ 8], 
selector); + w[33] = __byte_perm (w[ 8], w[ 7], selector); + w[32] = __byte_perm (w[ 7], w[ 6], selector); + w[31] = __byte_perm (w[ 6], w[ 5], selector); + w[30] = __byte_perm (w[ 5], w[ 4], selector); + w[29] = __byte_perm (w[ 4], w[ 3], selector); + w[28] = __byte_perm (w[ 3], w[ 2], selector); + w[27] = __byte_perm (w[ 2], w[ 1], selector); + w[26] = __byte_perm (w[ 1], w[ 0], selector); + w[25] = __byte_perm (w[ 0], 0, selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm (w[37], w[36], selector); + w[62] = __byte_perm (w[36], w[35], selector); + w[61] = __byte_perm (w[35], w[34], selector); + w[60] = __byte_perm (w[34], w[33], selector); + w[59] = __byte_perm (w[33], w[32], selector); + w[58] = __byte_perm (w[32], w[31], selector); + w[57] = __byte_perm (w[31], w[30], selector); + w[56] = __byte_perm (w[30], w[29], selector); + w[55] = __byte_perm (w[29], w[28], selector); + w[54] = __byte_perm (w[28], w[27], selector); + w[53] = __byte_perm (w[27], w[26], selector); + w[52] = __byte_perm (w[26], w[25], selector); + w[51] = __byte_perm (w[25], w[24], selector); + w[50] = __byte_perm (w[24], w[23], selector); + w[49] = __byte_perm (w[23], w[22], selector); + w[48] = __byte_perm (w[22], w[21], selector); + w[47] = __byte_perm (w[21], w[20], selector); + w[46] = __byte_perm (w[20], w[19], selector); + w[45] = __byte_perm (w[19], w[18], selector); + w[44] = __byte_perm (w[18], w[17], selector); + w[43] = __byte_perm (w[17], w[16], selector); + w[42] = __byte_perm (w[16], w[15], selector); + w[41] = __byte_perm (w[15], w[14], selector); + w[40] = __byte_perm (w[14], w[13], selector); + w[39] = __byte_perm (w[13], w[12], selector); + w[38] = 
__byte_perm (w[12], w[11], selector); + w[37] = __byte_perm (w[11], w[10], selector); + w[36] = __byte_perm (w[10], w[ 9], selector); + w[35] = __byte_perm (w[ 9], w[ 8], selector); + w[34] = __byte_perm (w[ 8], w[ 7], selector); + w[33] = __byte_perm (w[ 7], w[ 6], selector); + w[32] = __byte_perm (w[ 6], w[ 5], selector); + w[31] = __byte_perm (w[ 5], w[ 4], selector); + w[30] = __byte_perm (w[ 4], w[ 3], selector); + w[29] = __byte_perm (w[ 3], w[ 2], selector); + w[28] = __byte_perm (w[ 2], w[ 1], selector); + w[27] = __byte_perm (w[ 1], w[ 0], selector); + w[26] = __byte_perm (w[ 0], 0, selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm (w[36], w[35], selector); + w[62] = __byte_perm (w[35], w[34], selector); + w[61] = __byte_perm (w[34], w[33], selector); + w[60] = __byte_perm (w[33], w[32], selector); + w[59] = __byte_perm (w[32], w[31], selector); + w[58] = __byte_perm (w[31], w[30], selector); + w[57] = __byte_perm (w[30], w[29], selector); + w[56] = __byte_perm (w[29], w[28], selector); + w[55] = __byte_perm (w[28], w[27], selector); + w[54] = __byte_perm (w[27], w[26], selector); + w[53] = __byte_perm (w[26], w[25], selector); + w[52] = __byte_perm (w[25], w[24], selector); + w[51] = __byte_perm (w[24], w[23], selector); + w[50] = __byte_perm (w[23], w[22], selector); + w[49] = __byte_perm (w[22], w[21], selector); + w[48] = __byte_perm (w[21], w[20], selector); + w[47] = __byte_perm (w[20], w[19], selector); + w[46] = __byte_perm (w[19], w[18], selector); + w[45] = __byte_perm (w[18], w[17], selector); + w[44] = __byte_perm (w[17], w[16], selector); + w[43] = __byte_perm (w[16], w[15], selector); + w[42] = 
__byte_perm (w[15], w[14], selector); + w[41] = __byte_perm (w[14], w[13], selector); + w[40] = __byte_perm (w[13], w[12], selector); + w[39] = __byte_perm (w[12], w[11], selector); + w[38] = __byte_perm (w[11], w[10], selector); + w[37] = __byte_perm (w[10], w[ 9], selector); + w[36] = __byte_perm (w[ 9], w[ 8], selector); + w[35] = __byte_perm (w[ 8], w[ 7], selector); + w[34] = __byte_perm (w[ 7], w[ 6], selector); + w[33] = __byte_perm (w[ 6], w[ 5], selector); + w[32] = __byte_perm (w[ 5], w[ 4], selector); + w[31] = __byte_perm (w[ 4], w[ 3], selector); + w[30] = __byte_perm (w[ 3], w[ 2], selector); + w[29] = __byte_perm (w[ 2], w[ 1], selector); + w[28] = __byte_perm (w[ 1], w[ 0], selector); + w[27] = __byte_perm (w[ 0], 0, selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm (w[35], w[34], selector); + w[62] = __byte_perm (w[34], w[33], selector); + w[61] = __byte_perm (w[33], w[32], selector); + w[60] = __byte_perm (w[32], w[31], selector); + w[59] = __byte_perm (w[31], w[30], selector); + w[58] = __byte_perm (w[30], w[29], selector); + w[57] = __byte_perm (w[29], w[28], selector); + w[56] = __byte_perm (w[28], w[27], selector); + w[55] = __byte_perm (w[27], w[26], selector); + w[54] = __byte_perm (w[26], w[25], selector); + w[53] = __byte_perm (w[25], w[24], selector); + w[52] = __byte_perm (w[24], w[23], selector); + w[51] = __byte_perm (w[23], w[22], selector); + w[50] = __byte_perm (w[22], w[21], selector); + w[49] = __byte_perm (w[21], w[20], selector); + w[48] = __byte_perm (w[20], w[19], selector); + w[47] = __byte_perm (w[19], w[18], selector); + w[46] = __byte_perm (w[18], w[17], selector); + 
w[45] = __byte_perm (w[17], w[16], selector); + w[44] = __byte_perm (w[16], w[15], selector); + w[43] = __byte_perm (w[15], w[14], selector); + w[42] = __byte_perm (w[14], w[13], selector); + w[41] = __byte_perm (w[13], w[12], selector); + w[40] = __byte_perm (w[12], w[11], selector); + w[39] = __byte_perm (w[11], w[10], selector); + w[38] = __byte_perm (w[10], w[ 9], selector); + w[37] = __byte_perm (w[ 9], w[ 8], selector); + w[36] = __byte_perm (w[ 8], w[ 7], selector); + w[35] = __byte_perm (w[ 7], w[ 6], selector); + w[34] = __byte_perm (w[ 6], w[ 5], selector); + w[33] = __byte_perm (w[ 5], w[ 4], selector); + w[32] = __byte_perm (w[ 4], w[ 3], selector); + w[31] = __byte_perm (w[ 3], w[ 2], selector); + w[30] = __byte_perm (w[ 2], w[ 1], selector); + w[29] = __byte_perm (w[ 1], w[ 0], selector); + w[28] = __byte_perm (w[ 0], 0, selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm (w[34], w[33], selector); + w[62] = __byte_perm (w[33], w[32], selector); + w[61] = __byte_perm (w[32], w[31], selector); + w[60] = __byte_perm (w[31], w[30], selector); + w[59] = __byte_perm (w[30], w[29], selector); + w[58] = __byte_perm (w[29], w[28], selector); + w[57] = __byte_perm (w[28], w[27], selector); + w[56] = __byte_perm (w[27], w[26], selector); + w[55] = __byte_perm (w[26], w[25], selector); + w[54] = __byte_perm (w[25], w[24], selector); + w[53] = __byte_perm (w[24], w[23], selector); + w[52] = __byte_perm (w[23], w[22], selector); + w[51] = __byte_perm (w[22], w[21], selector); + w[50] = __byte_perm (w[21], w[20], selector); + w[49] = __byte_perm (w[20], w[19], selector); + w[48] = __byte_perm (w[19], 
w[18], selector); + w[47] = __byte_perm (w[18], w[17], selector); + w[46] = __byte_perm (w[17], w[16], selector); + w[45] = __byte_perm (w[16], w[15], selector); + w[44] = __byte_perm (w[15], w[14], selector); + w[43] = __byte_perm (w[14], w[13], selector); + w[42] = __byte_perm (w[13], w[12], selector); + w[41] = __byte_perm (w[12], w[11], selector); + w[40] = __byte_perm (w[11], w[10], selector); + w[39] = __byte_perm (w[10], w[ 9], selector); + w[38] = __byte_perm (w[ 9], w[ 8], selector); + w[37] = __byte_perm (w[ 8], w[ 7], selector); + w[36] = __byte_perm (w[ 7], w[ 6], selector); + w[35] = __byte_perm (w[ 6], w[ 5], selector); + w[34] = __byte_perm (w[ 5], w[ 4], selector); + w[33] = __byte_perm (w[ 4], w[ 3], selector); + w[32] = __byte_perm (w[ 3], w[ 2], selector); + w[31] = __byte_perm (w[ 2], w[ 1], selector); + w[30] = __byte_perm (w[ 1], w[ 0], selector); + w[29] = __byte_perm (w[ 0], 0, selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm (w[33], w[32], selector); + w[62] = __byte_perm (w[32], w[31], selector); + w[61] = __byte_perm (w[31], w[30], selector); + w[60] = __byte_perm (w[30], w[29], selector); + w[59] = __byte_perm (w[29], w[28], selector); + w[58] = __byte_perm (w[28], w[27], selector); + w[57] = __byte_perm (w[27], w[26], selector); + w[56] = __byte_perm (w[26], w[25], selector); + w[55] = __byte_perm (w[25], w[24], selector); + w[54] = __byte_perm (w[24], w[23], selector); + w[53] = __byte_perm (w[23], w[22], selector); + w[52] = __byte_perm (w[22], w[21], selector); + w[51] = __byte_perm (w[21], w[20], selector); + w[50] = __byte_perm (w[20], w[19], selector); 
+ w[49] = __byte_perm (w[19], w[18], selector); + w[48] = __byte_perm (w[18], w[17], selector); + w[47] = __byte_perm (w[17], w[16], selector); + w[46] = __byte_perm (w[16], w[15], selector); + w[45] = __byte_perm (w[15], w[14], selector); + w[44] = __byte_perm (w[14], w[13], selector); + w[43] = __byte_perm (w[13], w[12], selector); + w[42] = __byte_perm (w[12], w[11], selector); + w[41] = __byte_perm (w[11], w[10], selector); + w[40] = __byte_perm (w[10], w[ 9], selector); + w[39] = __byte_perm (w[ 9], w[ 8], selector); + w[38] = __byte_perm (w[ 8], w[ 7], selector); + w[37] = __byte_perm (w[ 7], w[ 6], selector); + w[36] = __byte_perm (w[ 6], w[ 5], selector); + w[35] = __byte_perm (w[ 5], w[ 4], selector); + w[34] = __byte_perm (w[ 4], w[ 3], selector); + w[33] = __byte_perm (w[ 3], w[ 2], selector); + w[32] = __byte_perm (w[ 2], w[ 1], selector); + w[31] = __byte_perm (w[ 1], w[ 0], selector); + w[30] = __byte_perm (w[ 0], 0, selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm (w[32], w[31], selector); + w[62] = __byte_perm (w[31], w[30], selector); + w[61] = __byte_perm (w[30], w[29], selector); + w[60] = __byte_perm (w[29], w[28], selector); + w[59] = __byte_perm (w[28], w[27], selector); + w[58] = __byte_perm (w[27], w[26], selector); + w[57] = __byte_perm (w[26], w[25], selector); + w[56] = __byte_perm (w[25], w[24], selector); + w[55] = __byte_perm (w[24], w[23], selector); + w[54] = __byte_perm (w[23], w[22], selector); + w[53] = __byte_perm (w[22], w[21], selector); + w[52] = __byte_perm (w[21], w[20], selector); + w[51] = __byte_perm (w[20], w[19], selector); + 
w[50] = __byte_perm (w[19], w[18], selector); + w[49] = __byte_perm (w[18], w[17], selector); + w[48] = __byte_perm (w[17], w[16], selector); + w[47] = __byte_perm (w[16], w[15], selector); + w[46] = __byte_perm (w[15], w[14], selector); + w[45] = __byte_perm (w[14], w[13], selector); + w[44] = __byte_perm (w[13], w[12], selector); + w[43] = __byte_perm (w[12], w[11], selector); + w[42] = __byte_perm (w[11], w[10], selector); + w[41] = __byte_perm (w[10], w[ 9], selector); + w[40] = __byte_perm (w[ 9], w[ 8], selector); + w[39] = __byte_perm (w[ 8], w[ 7], selector); + w[38] = __byte_perm (w[ 7], w[ 6], selector); + w[37] = __byte_perm (w[ 6], w[ 5], selector); + w[36] = __byte_perm (w[ 5], w[ 4], selector); + w[35] = __byte_perm (w[ 4], w[ 3], selector); + w[34] = __byte_perm (w[ 3], w[ 2], selector); + w[33] = __byte_perm (w[ 2], w[ 1], selector); + w[32] = __byte_perm (w[ 1], w[ 0], selector); + w[31] = __byte_perm (w[ 0], 0, selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm (w[31], w[30], selector); + w[62] = __byte_perm (w[30], w[29], selector); + w[61] = __byte_perm (w[29], w[28], selector); + w[60] = __byte_perm (w[28], w[27], selector); + w[59] = __byte_perm (w[27], w[26], selector); + w[58] = __byte_perm (w[26], w[25], selector); + w[57] = __byte_perm (w[25], w[24], selector); + w[56] = __byte_perm (w[24], w[23], selector); + w[55] = __byte_perm (w[23], w[22], selector); + w[54] = __byte_perm (w[22], w[21], selector); + w[53] = __byte_perm (w[21], w[20], selector); + w[52] = __byte_perm (w[20], w[19], selector); + w[51] = __byte_perm (w[19], w[18], 
selector); + w[50] = __byte_perm (w[18], w[17], selector); + w[49] = __byte_perm (w[17], w[16], selector); + w[48] = __byte_perm (w[16], w[15], selector); + w[47] = __byte_perm (w[15], w[14], selector); + w[46] = __byte_perm (w[14], w[13], selector); + w[45] = __byte_perm (w[13], w[12], selector); + w[44] = __byte_perm (w[12], w[11], selector); + w[43] = __byte_perm (w[11], w[10], selector); + w[42] = __byte_perm (w[10], w[ 9], selector); + w[41] = __byte_perm (w[ 9], w[ 8], selector); + w[40] = __byte_perm (w[ 8], w[ 7], selector); + w[39] = __byte_perm (w[ 7], w[ 6], selector); + w[38] = __byte_perm (w[ 6], w[ 5], selector); + w[37] = __byte_perm (w[ 5], w[ 4], selector); + w[36] = __byte_perm (w[ 4], w[ 3], selector); + w[35] = __byte_perm (w[ 3], w[ 2], selector); + w[34] = __byte_perm (w[ 2], w[ 1], selector); + w[33] = __byte_perm (w[ 1], w[ 0], selector); + w[32] = __byte_perm (w[ 0], 0, selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm (w[30], w[29], selector); + w[62] = __byte_perm (w[29], w[28], selector); + w[61] = __byte_perm (w[28], w[27], selector); + w[60] = __byte_perm (w[27], w[26], selector); + w[59] = __byte_perm (w[26], w[25], selector); + w[58] = __byte_perm (w[25], w[24], selector); + w[57] = __byte_perm (w[24], w[23], selector); + w[56] = __byte_perm (w[23], w[22], selector); + w[55] = __byte_perm (w[22], w[21], selector); + w[54] = __byte_perm (w[21], w[20], selector); + w[53] = __byte_perm (w[20], w[19], selector); + w[52] = __byte_perm (w[19], w[18], selector); + w[51] = __byte_perm (w[18], w[17], selector); + w[50] = 
__byte_perm (w[17], w[16], selector); + w[49] = __byte_perm (w[16], w[15], selector); + w[48] = __byte_perm (w[15], w[14], selector); + w[47] = __byte_perm (w[14], w[13], selector); + w[46] = __byte_perm (w[13], w[12], selector); + w[45] = __byte_perm (w[12], w[11], selector); + w[44] = __byte_perm (w[11], w[10], selector); + w[43] = __byte_perm (w[10], w[ 9], selector); + w[42] = __byte_perm (w[ 9], w[ 8], selector); + w[41] = __byte_perm (w[ 8], w[ 7], selector); + w[40] = __byte_perm (w[ 7], w[ 6], selector); + w[39] = __byte_perm (w[ 6], w[ 5], selector); + w[38] = __byte_perm (w[ 5], w[ 4], selector); + w[37] = __byte_perm (w[ 4], w[ 3], selector); + w[36] = __byte_perm (w[ 3], w[ 2], selector); + w[35] = __byte_perm (w[ 2], w[ 1], selector); + w[34] = __byte_perm (w[ 1], w[ 0], selector); + w[33] = __byte_perm (w[ 0], 0, selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm (w[29], w[28], selector); + w[62] = __byte_perm (w[28], w[27], selector); + w[61] = __byte_perm (w[27], w[26], selector); + w[60] = __byte_perm (w[26], w[25], selector); + w[59] = __byte_perm (w[25], w[24], selector); + w[58] = __byte_perm (w[24], w[23], selector); + w[57] = __byte_perm (w[23], w[22], selector); + w[56] = __byte_perm (w[22], w[21], selector); + w[55] = __byte_perm (w[21], w[20], selector); + w[54] = __byte_perm (w[20], w[19], selector); + w[53] = __byte_perm (w[19], w[18], selector); + w[52] = __byte_perm (w[18], w[17], selector); + w[51] = __byte_perm (w[17], w[16], selector); + w[50] = __byte_perm (w[16], w[15], selector); + w[49] = __byte_perm 
(w[15], w[14], selector); + w[48] = __byte_perm (w[14], w[13], selector); + w[47] = __byte_perm (w[13], w[12], selector); + w[46] = __byte_perm (w[12], w[11], selector); + w[45] = __byte_perm (w[11], w[10], selector); + w[44] = __byte_perm (w[10], w[ 9], selector); + w[43] = __byte_perm (w[ 9], w[ 8], selector); + w[42] = __byte_perm (w[ 8], w[ 7], selector); + w[41] = __byte_perm (w[ 7], w[ 6], selector); + w[40] = __byte_perm (w[ 6], w[ 5], selector); + w[39] = __byte_perm (w[ 5], w[ 4], selector); + w[38] = __byte_perm (w[ 4], w[ 3], selector); + w[37] = __byte_perm (w[ 3], w[ 2], selector); + w[36] = __byte_perm (w[ 2], w[ 1], selector); + w[35] = __byte_perm (w[ 1], w[ 0], selector); + w[34] = __byte_perm (w[ 0], 0, selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm (w[28], w[27], selector); + w[62] = __byte_perm (w[27], w[26], selector); + w[61] = __byte_perm (w[26], w[25], selector); + w[60] = __byte_perm (w[25], w[24], selector); + w[59] = __byte_perm (w[24], w[23], selector); + w[58] = __byte_perm (w[23], w[22], selector); + w[57] = __byte_perm (w[22], w[21], selector); + w[56] = __byte_perm (w[21], w[20], selector); + w[55] = __byte_perm (w[20], w[19], selector); + w[54] = __byte_perm (w[19], w[18], selector); + w[53] = __byte_perm (w[18], w[17], selector); + w[52] = __byte_perm (w[17], w[16], selector); + w[51] = __byte_perm (w[16], w[15], selector); + w[50] = __byte_perm (w[15], w[14], selector); + w[49] = __byte_perm (w[14], w[13], selector); + w[48] = __byte_perm (w[13], w[12], selector); + w[47] = __byte_perm 
(w[12], w[11], selector); + w[46] = __byte_perm (w[11], w[10], selector); + w[45] = __byte_perm (w[10], w[ 9], selector); + w[44] = __byte_perm (w[ 9], w[ 8], selector); + w[43] = __byte_perm (w[ 8], w[ 7], selector); + w[42] = __byte_perm (w[ 7], w[ 6], selector); + w[41] = __byte_perm (w[ 6], w[ 5], selector); + w[40] = __byte_perm (w[ 5], w[ 4], selector); + w[39] = __byte_perm (w[ 4], w[ 3], selector); + w[38] = __byte_perm (w[ 3], w[ 2], selector); + w[37] = __byte_perm (w[ 2], w[ 1], selector); + w[36] = __byte_perm (w[ 1], w[ 0], selector); + w[35] = __byte_perm (w[ 0], 0, selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm (w[27], w[26], selector); + w[62] = __byte_perm (w[26], w[25], selector); + w[61] = __byte_perm (w[25], w[24], selector); + w[60] = __byte_perm (w[24], w[23], selector); + w[59] = __byte_perm (w[23], w[22], selector); + w[58] = __byte_perm (w[22], w[21], selector); + w[57] = __byte_perm (w[21], w[20], selector); + w[56] = __byte_perm (w[20], w[19], selector); + w[55] = __byte_perm (w[19], w[18], selector); + w[54] = __byte_perm (w[18], w[17], selector); + w[53] = __byte_perm (w[17], w[16], selector); + w[52] = __byte_perm (w[16], w[15], selector); + w[51] = __byte_perm (w[15], w[14], selector); + w[50] = __byte_perm (w[14], w[13], selector); + w[49] = __byte_perm (w[13], w[12], selector); + w[48] = __byte_perm (w[12], w[11], selector); + w[47] = __byte_perm (w[11], w[10], selector); + w[46] = __byte_perm (w[10], w[ 9], selector); + w[45] = __byte_perm (w[ 9], w[ 8], selector); + w[44] = 
__byte_perm (w[ 8], w[ 7], selector); + w[43] = __byte_perm (w[ 7], w[ 6], selector); + w[42] = __byte_perm (w[ 6], w[ 5], selector); + w[41] = __byte_perm (w[ 5], w[ 4], selector); + w[40] = __byte_perm (w[ 4], w[ 3], selector); + w[39] = __byte_perm (w[ 3], w[ 2], selector); + w[38] = __byte_perm (w[ 2], w[ 1], selector); + w[37] = __byte_perm (w[ 1], w[ 0], selector); + w[36] = __byte_perm (w[ 0], 0, selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm (w[26], w[25], selector); + w[62] = __byte_perm (w[25], w[24], selector); + w[61] = __byte_perm (w[24], w[23], selector); + w[60] = __byte_perm (w[23], w[22], selector); + w[59] = __byte_perm (w[22], w[21], selector); + w[58] = __byte_perm (w[21], w[20], selector); + w[57] = __byte_perm (w[20], w[19], selector); + w[56] = __byte_perm (w[19], w[18], selector); + w[55] = __byte_perm (w[18], w[17], selector); + w[54] = __byte_perm (w[17], w[16], selector); + w[53] = __byte_perm (w[16], w[15], selector); + w[52] = __byte_perm (w[15], w[14], selector); + w[51] = __byte_perm (w[14], w[13], selector); + w[50] = __byte_perm (w[13], w[12], selector); + w[49] = __byte_perm (w[12], w[11], selector); + w[48] = __byte_perm (w[11], w[10], selector); + w[47] = __byte_perm (w[10], w[ 9], selector); + w[46] = __byte_perm (w[ 9], w[ 8], selector); + w[45] = __byte_perm (w[ 8], w[ 7], selector); + w[44] = __byte_perm (w[ 7], w[ 6], selector); + w[43] = __byte_perm (w[ 6], w[ 5], selector); + w[42] = __byte_perm (w[ 5], w[ 4], selector); + w[41] = __byte_perm (w[ 4], w[ 
3], selector); + w[40] = __byte_perm (w[ 3], w[ 2], selector); + w[39] = __byte_perm (w[ 2], w[ 1], selector); + w[38] = __byte_perm (w[ 1], w[ 0], selector); + w[37] = __byte_perm (w[ 0], 0, selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm (w[25], w[24], selector); + w[62] = __byte_perm (w[24], w[23], selector); + w[61] = __byte_perm (w[23], w[22], selector); + w[60] = __byte_perm (w[22], w[21], selector); + w[59] = __byte_perm (w[21], w[20], selector); + w[58] = __byte_perm (w[20], w[19], selector); + w[57] = __byte_perm (w[19], w[18], selector); + w[56] = __byte_perm (w[18], w[17], selector); + w[55] = __byte_perm (w[17], w[16], selector); + w[54] = __byte_perm (w[16], w[15], selector); + w[53] = __byte_perm (w[15], w[14], selector); + w[52] = __byte_perm (w[14], w[13], selector); + w[51] = __byte_perm (w[13], w[12], selector); + w[50] = __byte_perm (w[12], w[11], selector); + w[49] = __byte_perm (w[11], w[10], selector); + w[48] = __byte_perm (w[10], w[ 9], selector); + w[47] = __byte_perm (w[ 9], w[ 8], selector); + w[46] = __byte_perm (w[ 8], w[ 7], selector); + w[45] = __byte_perm (w[ 7], w[ 6], selector); + w[44] = __byte_perm (w[ 6], w[ 5], selector); + w[43] = __byte_perm (w[ 5], w[ 4], selector); + w[42] = __byte_perm (w[ 4], w[ 3], selector); + w[41] = __byte_perm (w[ 3], w[ 2], selector); + w[40] = __byte_perm (w[ 2], w[ 1], selector); + w[39] = __byte_perm (w[ 1], w[ 0], selector); + w[38] = __byte_perm (w[ 0], 0, selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] 
= 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm (w[24], w[23], selector); + w[62] = __byte_perm (w[23], w[22], selector); + w[61] = __byte_perm (w[22], w[21], selector); + w[60] = __byte_perm (w[21], w[20], selector); + w[59] = __byte_perm (w[20], w[19], selector); + w[58] = __byte_perm (w[19], w[18], selector); + w[57] = __byte_perm (w[18], w[17], selector); + w[56] = __byte_perm (w[17], w[16], selector); + w[55] = __byte_perm (w[16], w[15], selector); + w[54] = __byte_perm (w[15], w[14], selector); + w[53] = __byte_perm (w[14], w[13], selector); + w[52] = __byte_perm (w[13], w[12], selector); + w[51] = __byte_perm (w[12], w[11], selector); + w[50] = __byte_perm (w[11], w[10], selector); + w[49] = __byte_perm (w[10], w[ 9], selector); + w[48] = __byte_perm (w[ 9], w[ 8], selector); + w[47] = __byte_perm (w[ 8], w[ 7], selector); + w[46] = __byte_perm (w[ 7], w[ 6], selector); + w[45] = __byte_perm (w[ 6], w[ 5], selector); + w[44] = __byte_perm (w[ 5], w[ 4], selector); + w[43] = __byte_perm (w[ 4], w[ 3], selector); + w[42] = __byte_perm (w[ 3], w[ 2], selector); + w[41] = __byte_perm (w[ 2], w[ 1], selector); + w[40] = __byte_perm (w[ 1], w[ 0], selector); + w[39] = __byte_perm (w[ 0], 0, selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] 
= 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm (w[23], w[22], selector); + w[62] = __byte_perm (w[22], w[21], selector); + w[61] = __byte_perm (w[21], w[20], selector); + w[60] = __byte_perm (w[20], w[19], selector); + w[59] = __byte_perm (w[19], w[18], selector); + w[58] = __byte_perm (w[18], w[17], selector); + w[57] = __byte_perm (w[17], w[16], selector); + w[56] = __byte_perm (w[16], w[15], selector); + w[55] = __byte_perm (w[15], w[14], selector); + w[54] = __byte_perm (w[14], w[13], selector); + w[53] = __byte_perm (w[13], w[12], selector); + w[52] = __byte_perm (w[12], w[11], selector); + w[51] = __byte_perm (w[11], w[10], selector); + w[50] = __byte_perm (w[10], w[ 9], selector); + w[49] = __byte_perm (w[ 9], w[ 8], selector); + w[48] = __byte_perm (w[ 8], w[ 7], selector); + w[47] = __byte_perm (w[ 7], w[ 6], selector); + w[46] = __byte_perm (w[ 6], w[ 5], selector); + w[45] = __byte_perm (w[ 5], w[ 4], selector); + w[44] = __byte_perm (w[ 4], w[ 3], selector); + w[43] = __byte_perm (w[ 3], w[ 2], selector); + w[42] = __byte_perm (w[ 2], w[ 1], selector); + w[41] = __byte_perm (w[ 1], w[ 0], selector); + w[40] = __byte_perm (w[ 0], 0, selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm (w[22], w[21], selector); + w[62] = __byte_perm (w[21], w[20], selector); + w[61] = 
__byte_perm (w[20], w[19], selector); + w[60] = __byte_perm (w[19], w[18], selector); + w[59] = __byte_perm (w[18], w[17], selector); + w[58] = __byte_perm (w[17], w[16], selector); + w[57] = __byte_perm (w[16], w[15], selector); + w[56] = __byte_perm (w[15], w[14], selector); + w[55] = __byte_perm (w[14], w[13], selector); + w[54] = __byte_perm (w[13], w[12], selector); + w[53] = __byte_perm (w[12], w[11], selector); + w[52] = __byte_perm (w[11], w[10], selector); + w[51] = __byte_perm (w[10], w[ 9], selector); + w[50] = __byte_perm (w[ 9], w[ 8], selector); + w[49] = __byte_perm (w[ 8], w[ 7], selector); + w[48] = __byte_perm (w[ 7], w[ 6], selector); + w[47] = __byte_perm (w[ 6], w[ 5], selector); + w[46] = __byte_perm (w[ 5], w[ 4], selector); + w[45] = __byte_perm (w[ 4], w[ 3], selector); + w[44] = __byte_perm (w[ 3], w[ 2], selector); + w[43] = __byte_perm (w[ 2], w[ 1], selector); + w[42] = __byte_perm (w[ 1], w[ 0], selector); + w[41] = __byte_perm (w[ 0], 0, selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm (w[21], w[20], selector); + w[62] = __byte_perm (w[20], w[19], selector); + w[61] = __byte_perm (w[19], w[18], selector); + w[60] = __byte_perm (w[18], w[17], selector); + w[59] = __byte_perm (w[17], w[16], selector); + w[58] = __byte_perm (w[16], w[15], selector); + w[57] = __byte_perm (w[15], w[14], selector); + w[56] = __byte_perm (w[14], w[13], selector); + w[55] = __byte_perm (w[13], w[12], selector); + w[54] = 
__byte_perm (w[12], w[11], selector); + w[53] = __byte_perm (w[11], w[10], selector); + w[52] = __byte_perm (w[10], w[ 9], selector); + w[51] = __byte_perm (w[ 9], w[ 8], selector); + w[50] = __byte_perm (w[ 8], w[ 7], selector); + w[49] = __byte_perm (w[ 7], w[ 6], selector); + w[48] = __byte_perm (w[ 6], w[ 5], selector); + w[47] = __byte_perm (w[ 5], w[ 4], selector); + w[46] = __byte_perm (w[ 4], w[ 3], selector); + w[45] = __byte_perm (w[ 3], w[ 2], selector); + w[44] = __byte_perm (w[ 2], w[ 1], selector); + w[43] = __byte_perm (w[ 1], w[ 0], selector); + w[42] = __byte_perm (w[ 0], 0, selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = __byte_perm (w[20], w[19], selector); + w[62] = __byte_perm (w[19], w[18], selector); + w[61] = __byte_perm (w[18], w[17], selector); + w[60] = __byte_perm (w[17], w[16], selector); + w[59] = __byte_perm (w[16], w[15], selector); + w[58] = __byte_perm (w[15], w[14], selector); + w[57] = __byte_perm (w[14], w[13], selector); + w[56] = __byte_perm (w[13], w[12], selector); + w[55] = __byte_perm (w[12], w[11], selector); + w[54] = __byte_perm (w[11], w[10], selector); + w[53] = __byte_perm (w[10], w[ 9], selector); + w[52] = __byte_perm (w[ 9], w[ 8], selector); + w[51] = __byte_perm (w[ 8], w[ 7], selector); + w[50] = __byte_perm (w[ 7], w[ 6], selector); + w[49] = __byte_perm (w[ 6], w[ 5], selector); + w[48] = __byte_perm (w[ 5], w[ 4], selector); + w[47] = __byte_perm (w[ 4], w[ 3], selector); + 
w[46] = __byte_perm (w[ 3], w[ 2], selector); + w[45] = __byte_perm (w[ 2], w[ 1], selector); + w[44] = __byte_perm (w[ 1], w[ 0], selector); + w[43] = __byte_perm (w[ 0], 0, selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm (w[19], w[18], selector); + w[62] = __byte_perm (w[18], w[17], selector); + w[61] = __byte_perm (w[17], w[16], selector); + w[60] = __byte_perm (w[16], w[15], selector); + w[59] = __byte_perm (w[15], w[14], selector); + w[58] = __byte_perm (w[14], w[13], selector); + w[57] = __byte_perm (w[13], w[12], selector); + w[56] = __byte_perm (w[12], w[11], selector); + w[55] = __byte_perm (w[11], w[10], selector); + w[54] = __byte_perm (w[10], w[ 9], selector); + w[53] = __byte_perm (w[ 9], w[ 8], selector); + w[52] = __byte_perm (w[ 8], w[ 7], selector); + w[51] = __byte_perm (w[ 7], w[ 6], selector); + w[50] = __byte_perm (w[ 6], w[ 5], selector); + w[49] = __byte_perm (w[ 5], w[ 4], selector); + w[48] = __byte_perm (w[ 4], w[ 3], selector); + w[47] = __byte_perm (w[ 3], w[ 2], selector); + w[46] = __byte_perm (w[ 2], w[ 1], selector); + w[45] = __byte_perm (w[ 1], w[ 0], selector); + w[44] = __byte_perm (w[ 0], 0, selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + 
w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm (w[18], w[17], selector); + w[62] = __byte_perm (w[17], w[16], selector); + w[61] = __byte_perm (w[16], w[15], selector); + w[60] = __byte_perm (w[15], w[14], selector); + w[59] = __byte_perm (w[14], w[13], selector); + w[58] = __byte_perm (w[13], w[12], selector); + w[57] = __byte_perm (w[12], w[11], selector); + w[56] = __byte_perm (w[11], w[10], selector); + w[55] = __byte_perm (w[10], w[ 9], selector); + w[54] = __byte_perm (w[ 9], w[ 8], selector); + w[53] = __byte_perm (w[ 8], w[ 7], selector); + w[52] = __byte_perm (w[ 7], w[ 6], selector); + w[51] = __byte_perm (w[ 6], w[ 5], selector); + w[50] = __byte_perm (w[ 5], w[ 4], selector); + w[49] = __byte_perm (w[ 4], w[ 3], selector); + w[48] = __byte_perm (w[ 3], w[ 2], selector); + w[47] = __byte_perm (w[ 2], w[ 1], selector); + w[46] = __byte_perm (w[ 1], w[ 0], selector); + w[45] = __byte_perm (w[ 0], 0, selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm (w[17], w[16], selector); + w[62] = __byte_perm (w[16], w[15], selector); + w[61] = __byte_perm (w[15], w[14], selector); + w[60] = 
__byte_perm (w[14], w[13], selector); + w[59] = __byte_perm (w[13], w[12], selector); + w[58] = __byte_perm (w[12], w[11], selector); + w[57] = __byte_perm (w[11], w[10], selector); + w[56] = __byte_perm (w[10], w[ 9], selector); + w[55] = __byte_perm (w[ 9], w[ 8], selector); + w[54] = __byte_perm (w[ 8], w[ 7], selector); + w[53] = __byte_perm (w[ 7], w[ 6], selector); + w[52] = __byte_perm (w[ 6], w[ 5], selector); + w[51] = __byte_perm (w[ 5], w[ 4], selector); + w[50] = __byte_perm (w[ 4], w[ 3], selector); + w[49] = __byte_perm (w[ 3], w[ 2], selector); + w[48] = __byte_perm (w[ 2], w[ 1], selector); + w[47] = __byte_perm (w[ 1], w[ 0], selector); + w[46] = __byte_perm (w[ 0], 0, selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm (w[16], w[15], selector); + w[62] = __byte_perm (w[15], w[14], selector); + w[61] = __byte_perm (w[14], w[13], selector); + w[60] = __byte_perm (w[13], w[12], selector); + w[59] = __byte_perm (w[12], w[11], selector); + w[58] = __byte_perm (w[11], w[10], selector); + w[57] = __byte_perm (w[10], w[ 9], selector); + w[56] = __byte_perm (w[ 9], w[ 8], selector); + w[55] = __byte_perm (w[ 8], w[ 7], selector); + w[54] = __byte_perm (w[ 7], w[ 6], selector); + w[53] = __byte_perm (w[ 6], w[ 5], selector); + w[52] = __byte_perm (w[ 5], w[ 4], selector); + w[51] = __byte_perm (w[ 4], w[ 3], selector); + w[50] = __byte_perm (w[ 3], w[ 2], selector); 
+ w[49] = __byte_perm (w[ 2], w[ 1], selector); + w[48] = __byte_perm (w[ 1], w[ 0], selector); + w[47] = __byte_perm (w[ 0], 0, selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = __byte_perm (w[15], w[14], selector); + w[62] = __byte_perm (w[14], w[13], selector); + w[61] = __byte_perm (w[13], w[12], selector); + w[60] = __byte_perm (w[12], w[11], selector); + w[59] = __byte_perm (w[11], w[10], selector); + w[58] = __byte_perm (w[10], w[ 9], selector); + w[57] = __byte_perm (w[ 9], w[ 8], selector); + w[56] = __byte_perm (w[ 8], w[ 7], selector); + w[55] = __byte_perm (w[ 7], w[ 6], selector); + w[54] = __byte_perm (w[ 6], w[ 5], selector); + w[53] = __byte_perm (w[ 5], w[ 4], selector); + w[52] = __byte_perm (w[ 4], w[ 3], selector); + w[51] = __byte_perm (w[ 3], w[ 2], selector); + w[50] = __byte_perm (w[ 2], w[ 1], selector); + w[49] = __byte_perm (w[ 1], w[ 0], selector); + w[48] = __byte_perm (w[ 0], 0, selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + 
w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm (w[14], w[13], selector); + w[62] = __byte_perm (w[13], w[12], selector); + w[61] = __byte_perm (w[12], w[11], selector); + w[60] = __byte_perm (w[11], w[10], selector); + w[59] = __byte_perm (w[10], w[ 9], selector); + w[58] = __byte_perm (w[ 9], w[ 8], selector); + w[57] = __byte_perm (w[ 8], w[ 7], selector); + w[56] = __byte_perm (w[ 7], w[ 6], selector); + w[55] = __byte_perm (w[ 6], w[ 5], selector); + w[54] = __byte_perm (w[ 5], w[ 4], selector); + w[53] = __byte_perm (w[ 4], w[ 3], selector); + w[52] = __byte_perm (w[ 3], w[ 2], selector); + w[51] = __byte_perm (w[ 2], w[ 1], selector); + w[50] = __byte_perm (w[ 1], w[ 0], selector); + w[49] = __byte_perm (w[ 0], 0, selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm (w[13], w[12], selector); + w[62] = __byte_perm (w[12], w[11], selector); + w[61] = __byte_perm (w[11], w[10], selector); + w[60] = __byte_perm (w[10], w[ 9], selector); + w[59] = __byte_perm (w[ 9], w[ 8], selector); + w[58] = __byte_perm (w[ 8], w[ 7], selector); + w[57] = __byte_perm (w[ 7], w[ 6], selector); + w[56] = __byte_perm (w[ 6], w[ 5], selector); + w[55] = __byte_perm (w[ 5], w[ 4], selector); 
+ w[54] = __byte_perm (w[ 4], w[ 3], selector); + w[53] = __byte_perm (w[ 3], w[ 2], selector); + w[52] = __byte_perm (w[ 2], w[ 1], selector); + w[51] = __byte_perm (w[ 1], w[ 0], selector); + w[50] = __byte_perm (w[ 0], 0, selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm (w[12], w[11], selector); + w[62] = __byte_perm (w[11], w[10], selector); + w[61] = __byte_perm (w[10], w[ 9], selector); + w[60] = __byte_perm (w[ 9], w[ 8], selector); + w[59] = __byte_perm (w[ 8], w[ 7], selector); + w[58] = __byte_perm (w[ 7], w[ 6], selector); + w[57] = __byte_perm (w[ 6], w[ 5], selector); + w[56] = __byte_perm (w[ 5], w[ 4], selector); + w[55] = __byte_perm (w[ 4], w[ 3], selector); + w[54] = __byte_perm (w[ 3], w[ 2], selector); + w[53] = __byte_perm (w[ 2], w[ 1], selector); + w[52] = __byte_perm (w[ 1], w[ 0], selector); + w[51] = __byte_perm (w[ 0], 0, selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + 
w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm (w[11], w[10], selector); + w[62] = __byte_perm (w[10], w[ 9], selector); + w[61] = __byte_perm (w[ 9], w[ 8], selector); + w[60] = __byte_perm (w[ 8], w[ 7], selector); + w[59] = __byte_perm (w[ 7], w[ 6], selector); + w[58] = __byte_perm (w[ 6], w[ 5], selector); + w[57] = __byte_perm (w[ 5], w[ 4], selector); + w[56] = __byte_perm (w[ 4], w[ 3], selector); + w[55] = __byte_perm (w[ 3], w[ 2], selector); + w[54] = __byte_perm (w[ 2], w[ 1], selector); + w[53] = __byte_perm (w[ 1], w[ 0], selector); + w[52] = __byte_perm (w[ 0], 0, selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm (w[10], w[ 9], selector); + w[62] = __byte_perm (w[ 9], w[ 8], selector); + w[61] = __byte_perm (w[ 8], w[ 7], selector); + w[60] = __byte_perm (w[ 7], w[ 6], selector); + w[59] = __byte_perm (w[ 6], w[ 5], selector); + w[58] = __byte_perm (w[ 5], w[ 4], selector); + w[57] = __byte_perm (w[ 4], w[ 3], selector); + w[56] = __byte_perm (w[ 3], w[ 2], selector); + w[55] = __byte_perm (w[ 2], w[ 1], selector); + w[54] = __byte_perm (w[ 1], w[ 0], selector); + w[53] = __byte_perm (w[ 0], 
0, selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm (w[ 9], w[ 8], selector); + w[62] = __byte_perm (w[ 8], w[ 7], selector); + w[61] = __byte_perm (w[ 7], w[ 6], selector); + w[60] = __byte_perm (w[ 6], w[ 5], selector); + w[59] = __byte_perm (w[ 5], w[ 4], selector); + w[58] = __byte_perm (w[ 4], w[ 3], selector); + w[57] = __byte_perm (w[ 3], w[ 2], selector); + w[56] = __byte_perm (w[ 2], w[ 1], selector); + w[55] = __byte_perm (w[ 1], w[ 0], selector); + w[54] = __byte_perm (w[ 0], 0, selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm (w[ 8], w[ 7], selector); + w[62] = __byte_perm 
(w[ 7], w[ 6], selector); + w[61] = __byte_perm (w[ 6], w[ 5], selector); + w[60] = __byte_perm (w[ 5], w[ 4], selector); + w[59] = __byte_perm (w[ 4], w[ 3], selector); + w[58] = __byte_perm (w[ 3], w[ 2], selector); + w[57] = __byte_perm (w[ 2], w[ 1], selector); + w[56] = __byte_perm (w[ 1], w[ 0], selector); + w[55] = __byte_perm (w[ 0], 0, selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm (w[ 7], w[ 6], selector); + w[62] = __byte_perm (w[ 6], w[ 5], selector); + w[61] = __byte_perm (w[ 5], w[ 4], selector); + w[60] = __byte_perm (w[ 4], w[ 3], selector); + w[59] = __byte_perm (w[ 3], w[ 2], selector); + w[58] = __byte_perm (w[ 2], w[ 1], selector); + w[57] = __byte_perm (w[ 1], w[ 0], selector); + w[56] = __byte_perm (w[ 0], 0, selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + 
w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm (w[ 6], w[ 5], selector); + w[62] = __byte_perm (w[ 5], w[ 4], selector); + w[61] = __byte_perm (w[ 4], w[ 3], selector); + w[60] = __byte_perm (w[ 3], w[ 2], selector); + w[59] = __byte_perm (w[ 2], w[ 1], selector); + w[58] = __byte_perm (w[ 1], w[ 0], selector); + w[57] = __byte_perm (w[ 0], 0, selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm (w[ 5], w[ 4], selector); + w[62] = __byte_perm (w[ 4], w[ 3], selector); + w[61] = __byte_perm (w[ 3], w[ 2], selector); + w[60] = __byte_perm (w[ 2], w[ 1], selector); + w[59] = __byte_perm (w[ 1], w[ 0], selector); + w[58] = __byte_perm (w[ 0], 0, selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 
0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm (w[ 4], w[ 3], selector); + w[62] = __byte_perm (w[ 3], w[ 2], selector); + w[61] = __byte_perm (w[ 2], w[ 1], selector); + w[60] = __byte_perm (w[ 1], w[ 0], selector); + w[59] = __byte_perm (w[ 0], 0, selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm (w[ 3], w[ 2], selector); + w[62] = __byte_perm (w[ 2], w[ 1], selector); + w[61] = __byte_perm (w[ 1], w[ 0], selector); + w[60] = __byte_perm (w[ 0], 0, selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + 
w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm (w[ 2], w[ 1], selector); + w[62] = __byte_perm (w[ 1], w[ 0], selector); + w[61] = __byte_perm (w[ 0], 0, selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = __byte_perm (w[ 1], w[ 0], selector); + w[62] = __byte_perm (w[ 0], 0, selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + 
w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm (w[ 0], 0, selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif +} + void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) { #if defined cl_amd_media_ops @@ -15045,244 +33029,156 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + switch (offset / 4) { case 0: - w3[3] = amd_bytealign_S (w3[3], 
w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; 
case 1: - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 2: - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] 
= amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 3: - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = 
amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 4: - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = 
amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 5: - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = 
amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 6: - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -15290,32 +33186,18 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 7: - w3[3] = 
amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -15324,30 +33206,17 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 8: - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = 
amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -15357,28 +33226,16 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 9: - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -15389,26 +33246,15 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 10: - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w1[1], 
offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -15420,24 +33266,14 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 11: - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -15450,22 +33286,13 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 12: - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -15479,20 +33306,12 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 
== 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 13: - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -15507,18 +33326,11 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 14: - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -15534,16 +33346,10 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - break; case 15: - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -15560,13 +33366,25 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - break; } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); #endif #ifdef IS_NV @@ -15903,6 +33721,519 @@ void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 
w1[4], u32 w2[4], u32 w3 const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + + switch (offset / 4) + { + case 0: + c0[0] = amd_bytealign_S (w3[3], 0, offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); + + break; + + case 1: + c0[1] = amd_bytealign_S (w3[3], 0, offset); + c0[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], 
w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; + + break; + + case 2: + c0[2] = amd_bytealign_S (w3[3], 0, offset); + c0[1] = amd_bytealign_S (w3[2], w3[3], offset); + c0[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = amd_bytealign_S (w3[3], 0, offset); + c0[2] = amd_bytealign_S (w3[2], w3[3], offset); + c0[1] = amd_bytealign_S (w3[1], w3[2], offset); + c0[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = 
amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = amd_bytealign_S (w3[3], 0, offset); + c0[3] = amd_bytealign_S (w3[2], w3[3], offset); + c0[2] = amd_bytealign_S (w3[1], w3[2], offset); + c0[1] = amd_bytealign_S (w3[0], w3[1], offset); + c0[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = amd_bytealign_S (w3[3], 0, offset); + c1[0] = amd_bytealign_S (w3[2], w3[3], offset); + c0[3] = amd_bytealign_S (w3[1], w3[2], offset); + c0[2] = amd_bytealign_S (w3[0], w3[1], offset); + c0[1] = amd_bytealign_S (w2[3], w3[0], offset); + c0[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + 
c1[2] = amd_bytealign_S (w3[3], 0, offset); + c1[1] = amd_bytealign_S (w3[2], w3[3], offset); + c1[0] = amd_bytealign_S (w3[1], w3[2], offset); + c0[3] = amd_bytealign_S (w3[0], w3[1], offset); + c0[2] = amd_bytealign_S (w2[3], w3[0], offset); + c0[1] = amd_bytealign_S (w2[2], w2[3], offset); + c0[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = amd_bytealign_S (w3[3], 0, offset); + c1[2] = amd_bytealign_S (w3[2], w3[3], offset); + c1[1] = amd_bytealign_S (w3[1], w3[2], offset); + c1[0] = amd_bytealign_S (w3[0], w3[1], offset); + c0[3] = amd_bytealign_S (w2[3], w3[0], offset); + c0[2] = amd_bytealign_S (w2[2], w2[3], offset); + c0[1] = amd_bytealign_S (w2[1], w2[2], offset); + c0[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = amd_bytealign_S (w3[3], 0, offset); + c1[3] = amd_bytealign_S (w3[2], w3[3], offset); + 
c1[2] = amd_bytealign_S (w3[1], w3[2], offset); + c1[1] = amd_bytealign_S (w3[0], w3[1], offset); + c1[0] = amd_bytealign_S (w2[3], w3[0], offset); + c0[3] = amd_bytealign_S (w2[2], w2[3], offset); + c0[2] = amd_bytealign_S (w2[1], w2[2], offset); + c0[1] = amd_bytealign_S (w2[0], w2[1], offset); + c0[0] = amd_bytealign_S (w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = amd_bytealign_S (w3[3], 0, offset); + c2[0] = amd_bytealign_S (w3[2], w3[3], offset); + c1[3] = amd_bytealign_S (w3[1], w3[2], offset); + c1[2] = amd_bytealign_S (w3[0], w3[1], offset); + c1[1] = amd_bytealign_S (w2[3], w3[0], offset); + c1[0] = amd_bytealign_S (w2[2], w2[3], offset); + c0[3] = amd_bytealign_S (w2[1], w2[2], offset); + c0[2] = amd_bytealign_S (w2[0], w2[1], offset); + c0[1] = amd_bytealign_S (w1[3], w2[0], offset); + c0[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = amd_bytealign_S (w3[3], 0, offset); + c2[1] = amd_bytealign_S (w3[2], w3[3], offset); + c2[0] = amd_bytealign_S (w3[1], w3[2], 
offset); + c1[3] = amd_bytealign_S (w3[0], w3[1], offset); + c1[2] = amd_bytealign_S (w2[3], w3[0], offset); + c1[1] = amd_bytealign_S (w2[2], w2[3], offset); + c1[0] = amd_bytealign_S (w2[1], w2[2], offset); + c0[3] = amd_bytealign_S (w2[0], w2[1], offset); + c0[2] = amd_bytealign_S (w1[3], w2[0], offset); + c0[1] = amd_bytealign_S (w1[2], w1[3], offset); + c0[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = amd_bytealign_S (w3[3], 0, offset); + c2[2] = amd_bytealign_S (w3[2], w3[3], offset); + c2[1] = amd_bytealign_S (w3[1], w3[2], offset); + c2[0] = amd_bytealign_S (w3[0], w3[1], offset); + c1[3] = amd_bytealign_S (w2[3], w3[0], offset); + c1[2] = amd_bytealign_S (w2[2], w2[3], offset); + c1[1] = amd_bytealign_S (w2[1], w2[2], offset); + c1[0] = amd_bytealign_S (w2[0], w2[1], offset); + c0[3] = amd_bytealign_S (w1[3], w2[0], offset); + c0[2] = amd_bytealign_S (w1[2], w1[3], offset); + c0[1] = amd_bytealign_S (w1[1], w1[2], offset); + c0[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = amd_bytealign_S (w3[3], 0, offset); + c2[3] = amd_bytealign_S (w3[2], w3[3], offset); + c2[2] = amd_bytealign_S (w3[1], 
w3[2], offset); + c2[1] = amd_bytealign_S (w3[0], w3[1], offset); + c2[0] = amd_bytealign_S (w2[3], w3[0], offset); + c1[3] = amd_bytealign_S (w2[2], w2[3], offset); + c1[2] = amd_bytealign_S (w2[1], w2[2], offset); + c1[1] = amd_bytealign_S (w2[0], w2[1], offset); + c1[0] = amd_bytealign_S (w1[3], w2[0], offset); + c0[3] = amd_bytealign_S (w1[2], w1[3], offset); + c0[2] = amd_bytealign_S (w1[1], w1[2], offset); + c0[1] = amd_bytealign_S (w1[0], w1[1], offset); + c0[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = amd_bytealign_S (w3[3], 0, offset); + c3[0] = amd_bytealign_S (w3[2], w3[3], offset); + c2[3] = amd_bytealign_S (w3[1], w3[2], offset); + c2[2] = amd_bytealign_S (w3[0], w3[1], offset); + c2[1] = amd_bytealign_S (w2[3], w3[0], offset); + c2[0] = amd_bytealign_S (w2[2], w2[3], offset); + c1[3] = amd_bytealign_S (w2[1], w2[2], offset); + c1[2] = amd_bytealign_S (w2[0], w2[1], offset); + c1[1] = amd_bytealign_S (w1[3], w2[0], offset); + c1[0] = amd_bytealign_S (w1[2], w1[3], offset); + c0[3] = amd_bytealign_S (w1[1], w1[2], offset); + c0[2] = amd_bytealign_S (w1[0], w1[1], offset); + c0[1] = amd_bytealign_S (w0[3], w1[0], offset); + c0[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = amd_bytealign_S (w3[3], 0, offset); + c3[1] = 
amd_bytealign_S (w3[2], w3[3], offset); + c3[0] = amd_bytealign_S (w3[1], w3[2], offset); + c2[3] = amd_bytealign_S (w3[0], w3[1], offset); + c2[2] = amd_bytealign_S (w2[3], w3[0], offset); + c2[1] = amd_bytealign_S (w2[2], w2[3], offset); + c2[0] = amd_bytealign_S (w2[1], w2[2], offset); + c1[3] = amd_bytealign_S (w2[0], w2[1], offset); + c1[2] = amd_bytealign_S (w1[3], w2[0], offset); + c1[1] = amd_bytealign_S (w1[2], w1[3], offset); + c1[0] = amd_bytealign_S (w1[1], w1[2], offset); + c0[3] = amd_bytealign_S (w1[0], w1[1], offset); + c0[2] = amd_bytealign_S (w0[3], w1[0], offset); + c0[1] = amd_bytealign_S (w0[2], w0[3], offset); + c0[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = amd_bytealign_S (w3[3], 0, offset); + c3[2] = amd_bytealign_S (w3[2], w3[3], offset); + c3[1] = amd_bytealign_S (w3[1], w3[2], offset); + c3[0] = amd_bytealign_S (w3[0], w3[1], offset); + c2[3] = amd_bytealign_S (w2[3], w3[0], offset); + c2[2] = amd_bytealign_S (w2[2], w2[3], offset); + c2[1] = amd_bytealign_S (w2[1], w2[2], offset); + c2[0] = amd_bytealign_S (w2[0], w2[1], offset); + c1[3] = amd_bytealign_S (w1[3], w2[0], offset); + c1[2] = amd_bytealign_S (w1[2], w1[3], offset); + c1[1] = amd_bytealign_S (w1[1], w1[2], offset); + c1[0] = amd_bytealign_S (w1[0], w1[1], offset); + c0[3] = amd_bytealign_S (w0[3], w1[0], offset); + c0[2] = amd_bytealign_S (w0[2], w0[3], offset); + c0[1] = amd_bytealign_S (w0[1], w0[2], offset); + c0[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] 
= 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + c0[0] = swap32_S (c0[0]); + c0[1] = swap32_S (c0[1]); + c0[2] = swap32_S (c0[2]); + c0[3] = swap32_S (c0[3]); + c1[0] = swap32_S (c1[0]); + c1[1] = swap32_S (c1[1]); + c1[2] = swap32_S (c1[2]); + c1[3] = swap32_S (c1[3]); + c2[0] = swap32_S (c2[0]); + c2[1] = swap32_S (c2[1]); + c2[2] = swap32_S (c2[2]); + c2[3] = swap32_S (c2[3]); + c3[0] = swap32_S (c3[0]); + c3[1] = swap32_S (c3[1]); + c3[2] = swap32_S (c3[2]); + c3[3] = swap32_S (c3[3]); + #endif + + #ifdef IS_NV + // todo switch (offset / 4) { case 0: @@ -16697,6 +35028,7 @@ void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 break; } + #endif } void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) @@ -18288,452 +36620,284 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + w4[0] = swap32_S (w4[0]); + w4[1] = swap32_S (w4[1]); + w4[2] = swap32_S (w4[2]); + w4[3] = swap32_S (w4[3]); + w5[0] = 
swap32_S (w5[0]); + w5[1] = swap32_S (w5[1]); + w5[2] = swap32_S (w5[2]); + w5[3] = swap32_S (w5[3]); + w6[0] = swap32_S (w6[0]); + w6[1] = swap32_S (w6[1]); + w6[2] = swap32_S (w6[2]); + w6[3] = swap32_S (w6[3]); + w7[0] = swap32_S (w7[0]); + w7[1] = swap32_S (w7[1]); + w7[2] = swap32_S (w7[2]); + w7[3] = swap32_S (w7[3]); + switch (offset / 4) { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = 
amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + case 0: + w7[3] = amd_bytealign_S (w7[2], w7[3], offset); + w7[2] = amd_bytealign_S (w7[1], w7[2], offset); + w7[1] = amd_bytealign_S (w7[0], w7[1], offset); + w7[0] = amd_bytealign_S (w6[3], w7[0], offset); + w6[3] = amd_bytealign_S (w6[2], w6[3], offset); + w6[2] = amd_bytealign_S (w6[1], w6[2], offset); + w6[1] = amd_bytealign_S (w6[0], w6[1], offset); + w6[0] = amd_bytealign_S (w5[3], w6[0], offset); + w5[3] = amd_bytealign_S (w5[2], w5[3], offset); + w5[2] = amd_bytealign_S (w5[1], w5[2], offset); + w5[1] = amd_bytealign_S (w5[0], w5[1], offset); + w5[0] = amd_bytealign_S (w4[3], w5[0], offset); + w4[3] = amd_bytealign_S (w4[2], w4[3], offset); + w4[2] = amd_bytealign_S (w4[1], w4[2], offset); + w4[1] = amd_bytealign_S (w4[0], w4[1], offset); + w4[0] = amd_bytealign_S (w3[3], w4[0], offset); + w3[3] = amd_bytealign_S (w3[2], w3[3], offset); + w3[2] = amd_bytealign_S (w3[1], w3[2], offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], 
offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = 
amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 1: + w7[3] = amd_bytealign_S (w7[1], w7[2], offset); + w7[2] = amd_bytealign_S (w7[0], w7[1], offset); + w7[1] = amd_bytealign_S (w6[3], w7[0], offset); + w7[0] = amd_bytealign_S (w6[2], w6[3], offset); + w6[3] = amd_bytealign_S (w6[1], w6[2], offset); + w6[2] = amd_bytealign_S (w6[0], w6[1], offset); + w6[1] = amd_bytealign_S (w5[3], w6[0], offset); + w6[0] = amd_bytealign_S (w5[2], w5[3], offset); + w5[3] = amd_bytealign_S (w5[1], w5[2], offset); + w5[2] = amd_bytealign_S (w5[0], w5[1], offset); + w5[1] = amd_bytealign_S (w4[3], w5[0], offset); + w5[0] = amd_bytealign_S (w4[2], w4[3], offset); + w4[3] = amd_bytealign_S (w4[1], w4[2], offset); + w4[2] = amd_bytealign_S (w4[0], w4[1], offset); + w4[1] = amd_bytealign_S (w3[3], w4[0], offset); + w4[0] = amd_bytealign_S (w3[2], w3[3], offset); + w3[3] = amd_bytealign_S (w3[1], w3[2], offset); + w3[2] = amd_bytealign_S (w3[0], w3[1], offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = 
w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = 
amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 2: + w7[3] = amd_bytealign_S (w7[0], w7[1], offset); + w7[2] = amd_bytealign_S (w6[3], w7[0], offset); + w7[1] = amd_bytealign_S (w6[2], w6[3], offset); + w7[0] = amd_bytealign_S (w6[1], w6[2], offset); + w6[3] = amd_bytealign_S (w6[0], w6[1], offset); + w6[2] = amd_bytealign_S (w5[3], w6[0], offset); + w6[1] = amd_bytealign_S (w5[2], w5[3], offset); + w6[0] = amd_bytealign_S (w5[1], w5[2], offset); + w5[3] = amd_bytealign_S (w5[0], w5[1], offset); + w5[2] = amd_bytealign_S (w4[3], w5[0], offset); + w5[1] = amd_bytealign_S (w4[2], w4[3], offset); + w5[0] = amd_bytealign_S (w4[1], w4[2], offset); + w4[3] = amd_bytealign_S (w4[0], w4[1], offset); + w4[2] = amd_bytealign_S (w3[3], w4[0], offset); + w4[1] = amd_bytealign_S (w3[2], w3[3], offset); + w4[0] = amd_bytealign_S (w3[1], w3[2], offset); + w3[3] = amd_bytealign_S (w3[0], w3[1], offset); + w3[2] = amd_bytealign_S (w2[3], w3[0], offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; 
- w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 3: + w7[3] = amd_bytealign_S (w6[3], w7[0], offset); + w7[2] = amd_bytealign_S (w6[2], w6[3], offset); + w7[1] = 
amd_bytealign_S (w6[1], w6[2], offset); + w7[0] = amd_bytealign_S (w6[0], w6[1], offset); + w6[3] = amd_bytealign_S (w5[3], w6[0], offset); + w6[2] = amd_bytealign_S (w5[2], w5[3], offset); + w6[1] = amd_bytealign_S (w5[1], w5[2], offset); + w6[0] = amd_bytealign_S (w5[0], w5[1], offset); + w5[3] = amd_bytealign_S (w4[3], w5[0], offset); + w5[2] = amd_bytealign_S (w4[2], w4[3], offset); + w5[1] = amd_bytealign_S (w4[1], w4[2], offset); + w5[0] = amd_bytealign_S (w4[0], w4[1], offset); + w4[3] = amd_bytealign_S (w3[3], w4[0], offset); + w4[2] = amd_bytealign_S (w3[2], w3[3], offset); + w4[1] = amd_bytealign_S (w3[1], w3[2], offset); + w4[0] = amd_bytealign_S (w3[0], w3[1], offset); + w3[3] = amd_bytealign_S (w2[3], w3[0], offset); + w3[2] = amd_bytealign_S (w2[2], w2[3], offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S 
(w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 4: + w7[3] = amd_bytealign_S (w6[2], w6[3], offset); + w7[2] = amd_bytealign_S (w6[1], w6[2], offset); + w7[1] = amd_bytealign_S (w6[0], w6[1], offset); + w7[0] = amd_bytealign_S (w5[3], w6[0], offset); + w6[3] = amd_bytealign_S (w5[2], w5[3], offset); + w6[2] = amd_bytealign_S (w5[1], w5[2], offset); + w6[1] = amd_bytealign_S (w5[0], w5[1], offset); + w6[0] = amd_bytealign_S (w4[3], w5[0], offset); + w5[3] = amd_bytealign_S (w4[2], w4[3], offset); + 
w5[2] = amd_bytealign_S (w4[1], w4[2], offset); + w5[1] = amd_bytealign_S (w4[0], w4[1], offset); + w5[0] = amd_bytealign_S (w3[3], w4[0], offset); + w4[3] = amd_bytealign_S (w3[2], w3[3], offset); + w4[2] = amd_bytealign_S (w3[1], w3[2], offset); + w4[1] = amd_bytealign_S (w3[0], w3[1], offset); + w4[0] = amd_bytealign_S (w2[3], w3[0], offset); + w3[3] = amd_bytealign_S (w2[2], w2[3], offset); + w3[2] = amd_bytealign_S (w2[1], w2[2], offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = 
amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 5: + w7[3] = amd_bytealign_S (w6[1], w6[2], offset); + w7[2] = amd_bytealign_S (w6[0], w6[1], offset); + w7[1] = amd_bytealign_S (w5[3], w6[0], offset); + w7[0] = amd_bytealign_S (w5[2], w5[3], offset); + w6[3] = amd_bytealign_S (w5[1], w5[2], offset); + w6[2] = amd_bytealign_S (w5[0], w5[1], offset); + w6[1] = amd_bytealign_S (w4[3], w5[0], offset); + w6[0] = amd_bytealign_S (w4[2], w4[3], offset); + w5[3] = amd_bytealign_S (w4[1], w4[2], offset); + w5[2] = amd_bytealign_S (w4[0], w4[1], offset); + w5[1] = amd_bytealign_S (w3[3], w4[0], offset); + w5[0] = amd_bytealign_S (w3[2], w3[3], offset); + w4[3] = amd_bytealign_S (w3[1], w3[2], offset); + w4[2] = amd_bytealign_S (w3[0], w3[1], offset); + w4[1] = amd_bytealign_S (w2[3], w3[0], offset); + w4[0] = amd_bytealign_S (w2[2], w2[3], offset); + w3[3] = amd_bytealign_S (w2[1], w2[2], offset); + w3[2] = amd_bytealign_S (w2[0], w2[1], offset); + 
w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - 
w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 6: + w7[3] = amd_bytealign_S (w6[0], w6[1], offset); + w7[2] = amd_bytealign_S (w5[3], w6[0], offset); + w7[1] = amd_bytealign_S (w5[2], w5[3], offset); + w7[0] = amd_bytealign_S (w5[1], w5[2], offset); + w6[3] = amd_bytealign_S (w5[0], w5[1], offset); + w6[2] = amd_bytealign_S (w4[3], w5[0], offset); + w6[1] = amd_bytealign_S (w4[2], w4[3], offset); + w6[0] = amd_bytealign_S (w4[1], w4[2], offset); + w5[3] = amd_bytealign_S (w4[0], w4[1], offset); + w5[2] = amd_bytealign_S (w3[3], w4[0], offset); + w5[1] = amd_bytealign_S (w3[2], w3[3], offset); + w5[0] = amd_bytealign_S (w3[1], w3[2], offset); + w4[3] = amd_bytealign_S (w3[0], w3[1], offset); + w4[2] = amd_bytealign_S (w2[3], w3[0], offset); + w4[1] = amd_bytealign_S (w2[2], w2[3], offset); + w4[0] = amd_bytealign_S (w2[1], w2[2], offset); + w3[3] = amd_bytealign_S (w2[0], w2[1], offset); + w3[2] = amd_bytealign_S (w1[3], w2[0], offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; @@ -18741,64 +36905,34 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if 
(offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 7: + w7[3] = amd_bytealign_S (w5[3], w6[0], offset); + w7[2] 
= amd_bytealign_S (w5[2], w5[3], offset); + w7[1] = amd_bytealign_S (w5[1], w5[2], offset); + w7[0] = amd_bytealign_S (w5[0], w5[1], offset); + w6[3] = amd_bytealign_S (w4[3], w5[0], offset); + w6[2] = amd_bytealign_S (w4[2], w4[3], offset); + w6[1] = amd_bytealign_S (w4[1], w4[2], offset); + w6[0] = amd_bytealign_S (w4[0], w4[1], offset); + w5[3] = amd_bytealign_S (w3[3], w4[0], offset); + w5[2] = amd_bytealign_S (w3[2], w3[3], offset); + w5[1] = amd_bytealign_S (w3[1], w3[2], offset); + w5[0] = amd_bytealign_S (w3[0], w3[1], offset); + w4[3] = amd_bytealign_S (w2[3], w3[0], offset); + w4[2] = amd_bytealign_S (w2[2], w2[3], offset); + w4[1] = amd_bytealign_S (w2[1], w2[2], offset); + w4[0] = amd_bytealign_S (w2[0], w2[1], offset); + w3[3] = amd_bytealign_S (w1[3], w2[0], offset); + w3[2] = amd_bytealign_S (w1[2], w1[3], offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; @@ -18807,62 +36941,33 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - 
w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 8: + w7[3] = amd_bytealign_S (w5[2], w5[3], offset); + w7[2] = amd_bytealign_S (w5[1], w5[2], offset); + w7[1] = amd_bytealign_S (w5[0], w5[1], offset); + w7[0] = amd_bytealign_S (w4[3], w5[0], offset); + w6[3] = amd_bytealign_S (w4[2], w4[3], offset); + w6[2] = amd_bytealign_S (w4[1], w4[2], offset); + w6[1] = amd_bytealign_S (w4[0], w4[1], offset); + w6[0] = amd_bytealign_S (w3[3], w4[0], offset); + w5[3] = amd_bytealign_S (w3[2], w3[3], offset); + w5[2] = amd_bytealign_S (w3[1], w3[2], offset); + w5[1] = amd_bytealign_S (w3[0], w3[1], offset); + w5[0] = amd_bytealign_S (w2[3], w3[0], offset); + w4[3] = amd_bytealign_S (w2[2], w2[3], offset); + w4[2] = amd_bytealign_S (w2[1], w2[2], offset); + w4[1] = amd_bytealign_S (w2[0], w2[1], offset); + w4[0] = amd_bytealign_S 
(w1[3], w2[0], offset); + w3[3] = amd_bytealign_S (w1[2], w1[3], offset); + w3[2] = amd_bytealign_S (w1[1], w1[2], offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; @@ -18872,60 +36977,32 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S 
(w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + case 9: + w7[3] = amd_bytealign_S (w5[1], w5[2], offset); + w7[2] = amd_bytealign_S (w5[0], w5[1], offset); + w7[1] = amd_bytealign_S (w4[3], w5[0], offset); + w7[0] = amd_bytealign_S (w4[2], w4[3], offset); + w6[3] = amd_bytealign_S (w4[1], w4[2], offset); + w6[2] = amd_bytealign_S (w4[0], w4[1], offset); + w6[1] = amd_bytealign_S (w3[3], w4[0], offset); + w6[0] = amd_bytealign_S (w3[2], w3[3], offset); + w5[3] = amd_bytealign_S (w3[1], w3[2], offset); + w5[2] = amd_bytealign_S (w3[0], w3[1], offset); + w5[1] = amd_bytealign_S (w2[3], w3[0], offset); + w5[0] = amd_bytealign_S (w2[2], w2[3], offset); + w4[3] = amd_bytealign_S (w2[1], w2[2], offset); + w4[2] = amd_bytealign_S (w2[0], w2[1], offset); + w4[1] = amd_bytealign_S (w1[3], w2[0], offset); + w4[0] = amd_bytealign_S (w1[2], w1[3], offset); + w3[3] = amd_bytealign_S (w1[1], w1[2], offset); + w3[2] = amd_bytealign_S (w1[0], w1[1], offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; @@ -18936,58 +37013,31 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - 
w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w5[0], w5[1], offset); + w7[2] = amd_bytealign_S (w4[3], w5[0], offset); + w7[1] = amd_bytealign_S (w4[2], w4[3], offset); + w7[0] = amd_bytealign_S (w4[1], w4[2], offset); + w6[3] = amd_bytealign_S (w4[0], w4[1], offset); + w6[2] = amd_bytealign_S (w3[3], w4[0], offset); + w6[1] = amd_bytealign_S (w3[2], w3[3], offset); + w6[0] = amd_bytealign_S (w3[1], w3[2], offset); + w5[3] = amd_bytealign_S (w3[0], w3[1], offset); + w5[2] = amd_bytealign_S (w2[3], w3[0], offset); + w5[1] = amd_bytealign_S (w2[2], w2[3], offset); + w5[0] = amd_bytealign_S (w2[1], w2[2], 
offset); + w4[3] = amd_bytealign_S (w2[0], w2[1], offset); + w4[2] = amd_bytealign_S (w1[3], w2[0], offset); + w4[1] = amd_bytealign_S (w1[2], w1[3], offset); + w4[0] = amd_bytealign_S (w1[1], w1[2], offset); + w3[3] = amd_bytealign_S (w1[0], w1[1], offset); + w3[2] = amd_bytealign_S (w0[3], w1[0], offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; @@ -18999,56 +37049,30 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], 
offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[3], w5[0], offset); + w7[2] = amd_bytealign_S (w4[2], w4[3], offset); + w7[1] = amd_bytealign_S (w4[1], w4[2], offset); + w7[0] = amd_bytealign_S (w4[0], w4[1], offset); + w6[3] = amd_bytealign_S (w3[3], w4[0], offset); + w6[2] = amd_bytealign_S (w3[2], w3[3], offset); + w6[1] = amd_bytealign_S (w3[1], w3[2], offset); + w6[0] = amd_bytealign_S (w3[0], w3[1], offset); + w5[3] = amd_bytealign_S (w2[3], w3[0], offset); + w5[2] = amd_bytealign_S (w2[2], w2[3], offset); + w5[1] = amd_bytealign_S (w2[1], w2[2], offset); + w5[0] = amd_bytealign_S (w2[0], w2[1], offset); + w4[3] = amd_bytealign_S (w1[3], w2[0], offset); + w4[2] = amd_bytealign_S (w1[2], w1[3], offset); + w4[1] = amd_bytealign_S (w1[1], w1[2], offset); + w4[0] = amd_bytealign_S (w1[0], w1[1], offset); + w3[3] = amd_bytealign_S (w0[3], w1[0], offset); + w3[2] = amd_bytealign_S (w0[2], w0[3], offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; @@ -19061,54 +37085,29 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S 
(w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[2], w4[3], offset); + w7[2] = amd_bytealign_S (w4[1], w4[2], offset); + w7[1] = amd_bytealign_S (w4[0], w4[1], offset); + w7[0] = amd_bytealign_S (w3[3], w4[0], offset); + w6[3] = amd_bytealign_S (w3[2], w3[3], offset); + w6[2] = amd_bytealign_S (w3[1], w3[2], offset); + w6[1] = amd_bytealign_S (w3[0], w3[1], offset); + w6[0] = amd_bytealign_S (w2[3], w3[0], offset); + w5[3] = amd_bytealign_S (w2[2], w2[3], offset); + w5[2] = amd_bytealign_S (w2[1], w2[2], offset); + w5[1] = amd_bytealign_S (w2[0], w2[1], offset); + w5[0] = amd_bytealign_S (w1[3], w2[0], offset); + w4[3] = amd_bytealign_S (w1[2], w1[3], offset); + w4[2] = amd_bytealign_S (w1[1], w1[2], offset); + w4[1] = amd_bytealign_S (w1[0], w1[1], offset); + w4[0] = amd_bytealign_S (w0[3], w1[0], offset); + w3[3] = amd_bytealign_S (w0[2], w0[3], offset); + w3[2] = amd_bytealign_S (w0[1], w0[2], offset); + w3[1] = amd_bytealign_S 
(w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; @@ -19122,52 +37121,28 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[1], w4[2], offset); + w7[2] = amd_bytealign_S (w4[0], w4[1], offset); + w7[1] = amd_bytealign_S (w3[3], w4[0], offset); + w7[0] = amd_bytealign_S (w3[2], w3[3], offset); + w6[3] = amd_bytealign_S (w3[1], w3[2], offset); + w6[2] = amd_bytealign_S 
(w3[0], w3[1], offset); + w6[1] = amd_bytealign_S (w2[3], w3[0], offset); + w6[0] = amd_bytealign_S (w2[2], w2[3], offset); + w5[3] = amd_bytealign_S (w2[1], w2[2], offset); + w5[2] = amd_bytealign_S (w2[0], w2[1], offset); + w5[1] = amd_bytealign_S (w1[3], w2[0], offset); + w5[0] = amd_bytealign_S (w1[2], w1[3], offset); + w4[3] = amd_bytealign_S (w1[1], w1[2], offset); + w4[2] = amd_bytealign_S (w1[0], w1[1], offset); + w4[1] = amd_bytealign_S (w0[3], w1[0], offset); + w4[0] = amd_bytealign_S (w0[2], w0[3], offset); + w3[3] = amd_bytealign_S (w0[1], w0[2], offset); + w3[2] = amd_bytealign_S (w0[0], w0[1], offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; @@ -19182,50 +37157,27 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], 
offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w4[0], w4[1], offset); + w7[2] = amd_bytealign_S (w3[3], w4[0], offset); + w7[1] = amd_bytealign_S (w3[2], w3[3], offset); + w7[0] = amd_bytealign_S (w3[1], w3[2], offset); + w6[3] = amd_bytealign_S (w3[0], w3[1], offset); + w6[2] = amd_bytealign_S (w2[3], w3[0], offset); + w6[1] = amd_bytealign_S (w2[2], w2[3], offset); + w6[0] = amd_bytealign_S (w2[1], w2[2], offset); + w5[3] = amd_bytealign_S (w2[0], w2[1], offset); + w5[2] = amd_bytealign_S (w1[3], w2[0], offset); + w5[1] = amd_bytealign_S (w1[2], w1[3], offset); + w5[0] = amd_bytealign_S (w1[1], w1[2], offset); + w4[3] = amd_bytealign_S (w1[0], w1[1], offset); + w4[2] = amd_bytealign_S (w0[3], w1[0], offset); + w4[1] = amd_bytealign_S (w0[2], w0[3], offset); + w4[0] = amd_bytealign_S (w0[1], w0[2], offset); + w3[3] = amd_bytealign_S (w0[0], w0[1], offset); + w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; @@ -19241,48 +37193,26 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - break; case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], 
offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w7[3] = amd_bytealign_S (w3[3], w4[0], offset); + w7[2] = amd_bytealign_S (w3[2], w3[3], offset); + w7[1] = amd_bytealign_S (w3[1], w3[2], offset); + w7[0] = amd_bytealign_S (w3[0], w3[1], offset); + w6[3] = amd_bytealign_S (w2[3], w3[0], offset); + w6[2] = amd_bytealign_S (w2[2], w2[3], offset); + w6[1] = amd_bytealign_S (w2[1], w2[2], offset); + w6[0] = amd_bytealign_S (w2[0], w2[1], offset); + w5[3] = amd_bytealign_S (w1[3], w2[0], offset); + w5[2] = amd_bytealign_S (w1[2], w1[3], offset); + w5[1] = amd_bytealign_S (w1[1], w1[2], offset); + w5[0] = amd_bytealign_S (w1[0], w1[1], offset); + w4[3] = amd_bytealign_S (w0[3], w1[0], offset); + w4[2] = amd_bytealign_S (w0[2], w0[3], offset); + w4[1] = amd_bytealign_S (w0[1], w0[2], offset); + w4[0] = amd_bytealign_S (w0[0], w0[1], offset); + w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; @@ -19299,29 +37229,617 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 w0[1] = 0; w0[0] = 0; - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } + break; + + case 
16: + w7[3] = amd_bytealign_S (w3[2], w3[3], offset); + w7[2] = amd_bytealign_S (w3[1], w3[2], offset); + w7[1] = amd_bytealign_S (w3[0], w3[1], offset); + w7[0] = amd_bytealign_S (w2[3], w3[0], offset); + w6[3] = amd_bytealign_S (w2[2], w2[3], offset); + w6[2] = amd_bytealign_S (w2[1], w2[2], offset); + w6[1] = amd_bytealign_S (w2[0], w2[1], offset); + w6[0] = amd_bytealign_S (w1[3], w2[0], offset); + w5[3] = amd_bytealign_S (w1[2], w1[3], offset); + w5[2] = amd_bytealign_S (w1[1], w1[2], offset); + w5[1] = amd_bytealign_S (w1[0], w1[1], offset); + w5[0] = amd_bytealign_S (w0[3], w1[0], offset); + w4[3] = amd_bytealign_S (w0[2], w0[3], offset); + w4[2] = amd_bytealign_S (w0[1], w0[2], offset); + w4[1] = amd_bytealign_S (w0[0], w0[1], offset); + w4[0] = amd_bytealign_S ( 0, w0[0], offset); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 17: + w7[3] = amd_bytealign_S (w3[1], w3[2], offset); + w7[2] = amd_bytealign_S (w3[0], w3[1], offset); + w7[1] = amd_bytealign_S (w2[3], w3[0], offset); + w7[0] = amd_bytealign_S (w2[2], w2[3], offset); + w6[3] = amd_bytealign_S (w2[1], w2[2], offset); + w6[2] = amd_bytealign_S (w2[0], w2[1], offset); + w6[1] = amd_bytealign_S (w1[3], w2[0], offset); + w6[0] = amd_bytealign_S (w1[2], w1[3], offset); + w5[3] = amd_bytealign_S (w1[1], w1[2], offset); + w5[2] = amd_bytealign_S (w1[0], w1[1], offset); + w5[1] = amd_bytealign_S (w0[3], w1[0], offset); + w5[0] = amd_bytealign_S (w0[2], w0[3], offset); + w4[3] = amd_bytealign_S (w0[1], w0[2], offset); + w4[2] = amd_bytealign_S (w0[0], w0[1], offset); + w4[1] = amd_bytealign_S ( 0, w0[0], offset); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 
0; + + break; + + case 18: + w7[3] = amd_bytealign_S (w3[0], w3[1], offset); + w7[2] = amd_bytealign_S (w2[3], w3[0], offset); + w7[1] = amd_bytealign_S (w2[2], w2[3], offset); + w7[0] = amd_bytealign_S (w2[1], w2[2], offset); + w6[3] = amd_bytealign_S (w2[0], w2[1], offset); + w6[2] = amd_bytealign_S (w1[3], w2[0], offset); + w6[1] = amd_bytealign_S (w1[2], w1[3], offset); + w6[0] = amd_bytealign_S (w1[1], w1[2], offset); + w5[3] = amd_bytealign_S (w1[0], w1[1], offset); + w5[2] = amd_bytealign_S (w0[3], w1[0], offset); + w5[1] = amd_bytealign_S (w0[2], w0[3], offset); + w5[0] = amd_bytealign_S (w0[1], w0[2], offset); + w4[3] = amd_bytealign_S (w0[0], w0[1], offset); + w4[2] = amd_bytealign_S ( 0, w0[0], offset); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 19: + w7[3] = amd_bytealign_S (w2[3], w3[0], offset); + w7[2] = amd_bytealign_S (w2[2], w2[3], offset); + w7[1] = amd_bytealign_S (w2[1], w2[2], offset); + w7[0] = amd_bytealign_S (w2[0], w2[1], offset); + w6[3] = amd_bytealign_S (w1[3], w2[0], offset); + w6[2] = amd_bytealign_S (w1[2], w1[3], offset); + w6[1] = amd_bytealign_S (w1[1], w1[2], offset); + w6[0] = amd_bytealign_S (w1[0], w1[1], offset); + w5[3] = amd_bytealign_S (w0[3], w1[0], offset); + w5[2] = amd_bytealign_S (w0[2], w0[3], offset); + w5[1] = amd_bytealign_S (w0[1], w0[2], offset); + w5[0] = amd_bytealign_S (w0[0], w0[1], offset); + w4[3] = amd_bytealign_S ( 0, w0[0], offset); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = amd_bytealign_S (w2[2], w2[3], offset); + w7[2] = amd_bytealign_S (w2[1], w2[2], offset); 
+ w7[1] = amd_bytealign_S (w2[0], w2[1], offset); + w7[0] = amd_bytealign_S (w1[3], w2[0], offset); + w6[3] = amd_bytealign_S (w1[2], w1[3], offset); + w6[2] = amd_bytealign_S (w1[1], w1[2], offset); + w6[1] = amd_bytealign_S (w1[0], w1[1], offset); + w6[0] = amd_bytealign_S (w0[3], w1[0], offset); + w5[3] = amd_bytealign_S (w0[2], w0[3], offset); + w5[2] = amd_bytealign_S (w0[1], w0[2], offset); + w5[1] = amd_bytealign_S (w0[0], w0[1], offset); + w5[0] = amd_bytealign_S ( 0, w0[0], offset); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + w7[3] = amd_bytealign_S (w2[1], w2[2], offset); + w7[2] = amd_bytealign_S (w2[0], w2[1], offset); + w7[1] = amd_bytealign_S (w1[3], w2[0], offset); + w7[0] = amd_bytealign_S (w1[2], w1[3], offset); + w6[3] = amd_bytealign_S (w1[1], w1[2], offset); + w6[2] = amd_bytealign_S (w1[0], w1[1], offset); + w6[1] = amd_bytealign_S (w0[3], w1[0], offset); + w6[0] = amd_bytealign_S (w0[2], w0[3], offset); + w5[3] = amd_bytealign_S (w0[1], w0[2], offset); + w5[2] = amd_bytealign_S (w0[0], w0[1], offset); + w5[1] = amd_bytealign_S ( 0, w0[0], offset); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = amd_bytealign_S (w2[0], w2[1], offset); + w7[2] = amd_bytealign_S (w1[3], w2[0], offset); + w7[1] = amd_bytealign_S (w1[2], w1[3], offset); + w7[0] = amd_bytealign_S (w1[1], w1[2], offset); + w6[3] = amd_bytealign_S (w1[0], w1[1], offset); + w6[2] = amd_bytealign_S (w0[3], w1[0], offset); + w6[1] = amd_bytealign_S (w0[2], w0[3], offset); + w6[0] = 
amd_bytealign_S (w0[1], w0[2], offset); + w5[3] = amd_bytealign_S (w0[0], w0[1], offset); + w5[2] = amd_bytealign_S ( 0, w0[0], offset); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = amd_bytealign_S (w1[3], w2[0], offset); + w7[2] = amd_bytealign_S (w1[2], w1[3], offset); + w7[1] = amd_bytealign_S (w1[1], w1[2], offset); + w7[0] = amd_bytealign_S (w1[0], w1[1], offset); + w6[3] = amd_bytealign_S (w0[3], w1[0], offset); + w6[2] = amd_bytealign_S (w0[2], w0[3], offset); + w6[1] = amd_bytealign_S (w0[1], w0[2], offset); + w6[0] = amd_bytealign_S (w0[0], w0[1], offset); + w5[3] = amd_bytealign_S ( 0, w0[0], offset); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = amd_bytealign_S (w1[2], w1[3], offset); + w7[2] = amd_bytealign_S (w1[1], w1[2], offset); + w7[1] = amd_bytealign_S (w1[0], w1[1], offset); + w7[0] = amd_bytealign_S (w0[3], w1[0], offset); + w6[3] = amd_bytealign_S (w0[2], w0[3], offset); + w6[2] = amd_bytealign_S (w0[1], w0[2], offset); + w6[1] = amd_bytealign_S (w0[0], w0[1], offset); + w6[0] = amd_bytealign_S ( 0, w0[0], offset); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] = amd_bytealign_S (w1[1], w1[2], offset); 
+ w7[2] = amd_bytealign_S (w1[0], w1[1], offset); + w7[1] = amd_bytealign_S (w0[3], w1[0], offset); + w7[0] = amd_bytealign_S (w0[2], w0[3], offset); + w6[3] = amd_bytealign_S (w0[1], w0[2], offset); + w6[2] = amd_bytealign_S (w0[0], w0[1], offset); + w6[1] = amd_bytealign_S ( 0, w0[0], offset); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = amd_bytealign_S (w1[0], w1[1], offset); + w7[2] = amd_bytealign_S (w0[3], w1[0], offset); + w7[1] = amd_bytealign_S (w0[2], w0[3], offset); + w7[0] = amd_bytealign_S (w0[1], w0[2], offset); + w6[3] = amd_bytealign_S (w0[0], w0[1], offset); + w6[2] = amd_bytealign_S ( 0, w0[0], offset); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = amd_bytealign_S (w0[3], w1[0], offset); + w7[2] = amd_bytealign_S (w0[2], w0[3], offset); + w7[1] = amd_bytealign_S (w0[1], w0[2], offset); + w7[0] = amd_bytealign_S (w0[0], w0[1], offset); + w6[3] = amd_bytealign_S ( 0, w0[0], offset); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = amd_bytealign_S (w0[2], w0[3], offset); + w7[2] = 
amd_bytealign_S (w0[1], w0[2], offset); + w7[1] = amd_bytealign_S (w0[0], w0[1], offset); + w7[0] = amd_bytealign_S ( 0, w0[0], offset); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = amd_bytealign_S (w0[1], w0[2], offset); + w7[2] = amd_bytealign_S (w0[0], w0[1], offset); + w7[1] = amd_bytealign_S ( 0, w0[0], offset); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = amd_bytealign_S (w0[0], w0[1], offset); + w7[2] = amd_bytealign_S ( 0, w0[0], offset); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = amd_bytealign_S ( 0, w0[0], offset); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + 
w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); + w3[2] = swap32_S (w3[2]); + w3[3] = swap32_S (w3[3]); + w4[0] = swap32_S (w4[0]); + w4[1] = swap32_S (w4[1]); + w4[2] = swap32_S (w4[2]); + w4[3] = swap32_S (w4[3]); + w5[0] = swap32_S (w5[0]); + w5[1] = swap32_S (w5[1]); + w5[2] = swap32_S (w5[2]); + w5[3] = swap32_S (w5[3]); + w6[0] = swap32_S (w6[0]); + w6[1] = swap32_S (w6[1]); + w6[2] = swap32_S (w6[2]); + w6[3] = swap32_S (w6[3]); + w7[0] = swap32_S (w7[0]); + w7[1] = swap32_S (w7[1]); + w7[2] = swap32_S (w7[2]); + w7[3] = swap32_S (w7[3]); #endif #ifdef IS_NV @@ -25593,868 +44111,479 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); + switch (offset / 4) { case 0: - w[63] = amd_bytealign_S (w[63], w[62], offset_minus_4); - w[62] = amd_bytealign_S (w[62], w[61], offset_minus_4); - w[61] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[60] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[59] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[58] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[57] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[56] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[55] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[54] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[53] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[52] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[51] = amd_bytealign_S (w[51], w[50], 
offset_minus_4); - w[50] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[49] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[48] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[47] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[46] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[45] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[44] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[43] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[42] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[41] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[40] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[39] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[38] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[37] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[36] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[35] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[34] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[33] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[32] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[31] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[30] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[29] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[28] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[27] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[26] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[25] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[24] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[23] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[22] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[21] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[20] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[19] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[18] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[17] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[16] = 
amd_bytealign_S (w[16], w[15], offset_minus_4); - w[15] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[14] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[13] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[12] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[11] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[10] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 1] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 0] = amd_bytealign_S (w[ 0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w[ 0] = w[ 1]; - w[ 1] = w[ 2]; - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = 
w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } + w[63] = amd_bytealign_S (w[62], w[63], offset); + w[62] = amd_bytealign_S (w[61], w[62], offset); + w[61] = amd_bytealign_S (w[60], w[61], offset); + w[60] = amd_bytealign_S (w[59], w[60], offset); + w[59] = amd_bytealign_S (w[58], w[59], offset); + w[58] = amd_bytealign_S (w[57], w[58], offset); + w[57] = amd_bytealign_S (w[56], w[57], offset); + w[56] = amd_bytealign_S (w[55], w[56], offset); + w[55] = amd_bytealign_S (w[54], w[55], offset); + w[54] = amd_bytealign_S (w[53], w[54], offset); + w[53] = amd_bytealign_S (w[52], w[53], offset); + w[52] = amd_bytealign_S (w[51], w[52], offset); + w[51] = amd_bytealign_S (w[50], w[51], offset); + w[50] = amd_bytealign_S (w[49], w[50], offset); + w[49] = amd_bytealign_S (w[48], w[49], offset); + w[48] = amd_bytealign_S (w[47], w[48], offset); + w[47] = amd_bytealign_S (w[46], w[47], offset); + w[46] = amd_bytealign_S (w[45], w[46], offset); + w[45] = amd_bytealign_S (w[44], w[45], offset); + w[44] = amd_bytealign_S (w[43], w[44], offset); + w[43] = amd_bytealign_S (w[42], w[43], offset); + w[42] = amd_bytealign_S (w[41], w[42], offset); + w[41] = amd_bytealign_S (w[40], w[41], offset); + w[40] = amd_bytealign_S (w[39], w[40], offset); + w[39] = amd_bytealign_S (w[38], w[39], offset); + w[38] = amd_bytealign_S (w[37], w[38], offset); + w[37] = amd_bytealign_S (w[36], w[37], offset); + w[36] = amd_bytealign_S (w[35], w[36], offset); + w[35] = amd_bytealign_S (w[34], w[35], offset); + w[34] = amd_bytealign_S (w[33], w[34], offset); + w[33] = amd_bytealign_S (w[32], w[33], offset); + w[32] = amd_bytealign_S (w[31], w[32], offset); + w[31] = amd_bytealign_S (w[30], w[31], offset); + w[30] = amd_bytealign_S (w[29], w[30], offset); + w[29] = amd_bytealign_S (w[28], w[29], offset); + w[28] = amd_bytealign_S (w[27], w[28], offset); + w[27] = amd_bytealign_S (w[26], w[27], offset); + w[26] = amd_bytealign_S (w[25], w[26], offset); + w[25] 
= amd_bytealign_S (w[24], w[25], offset); + w[24] = amd_bytealign_S (w[23], w[24], offset); + w[23] = amd_bytealign_S (w[22], w[23], offset); + w[22] = amd_bytealign_S (w[21], w[22], offset); + w[21] = amd_bytealign_S (w[20], w[21], offset); + w[20] = amd_bytealign_S (w[19], w[20], offset); + w[19] = amd_bytealign_S (w[18], w[19], offset); + w[18] = amd_bytealign_S (w[17], w[18], offset); + w[17] = amd_bytealign_S (w[16], w[17], offset); + w[16] = amd_bytealign_S (w[15], w[16], offset); + w[15] = amd_bytealign_S (w[14], w[15], offset); + w[14] = amd_bytealign_S (w[13], w[14], offset); + w[13] = amd_bytealign_S (w[12], w[13], offset); + w[12] = amd_bytealign_S (w[11], w[12], offset); + w[11] = amd_bytealign_S (w[10], w[11], offset); + w[10] = amd_bytealign_S (w[ 9], w[10], offset); + w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); break; case 1: - w[63] = amd_bytealign_S (w[62], w[61], offset_minus_4); - w[62] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[61] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[60] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[59] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[58] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[57] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[56] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[55] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[54] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[53] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[52] = amd_bytealign_S (w[51], w[50], offset_minus_4); 
- w[51] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[50] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[49] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[48] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[47] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[46] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[45] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[44] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[43] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[42] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[41] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[40] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[39] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[38] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[37] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[36] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[35] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[34] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[33] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[32] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[31] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[30] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[29] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[28] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[27] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[26] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[25] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[24] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[23] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[22] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[21] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[20] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[19] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[18] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[17] = amd_bytealign_S 
(w[16], w[15], offset_minus_4); - w[16] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[15] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[14] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[13] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[12] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[11] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[10] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 1] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[61], w[62], offset); + w[62] = amd_bytealign_S (w[60], w[61], offset); + w[61] = amd_bytealign_S (w[59], w[60], offset); + w[60] = amd_bytealign_S (w[58], w[59], offset); + w[59] = amd_bytealign_S (w[57], w[58], offset); + w[58] = amd_bytealign_S (w[56], w[57], offset); + w[57] = amd_bytealign_S (w[55], w[56], offset); + w[56] = amd_bytealign_S (w[54], w[55], offset); + w[55] = amd_bytealign_S (w[53], w[54], offset); + w[54] = amd_bytealign_S (w[52], w[53], offset); + w[53] = amd_bytealign_S (w[51], w[52], offset); + w[52] = amd_bytealign_S (w[50], w[51], offset); + w[51] = amd_bytealign_S (w[49], w[50], offset); + w[50] = amd_bytealign_S (w[48], w[49], offset); + w[49] = amd_bytealign_S (w[47], w[48], offset); + w[48] = amd_bytealign_S (w[46], w[47], offset); + w[47] = amd_bytealign_S (w[45], w[46], offset); + w[46] = amd_bytealign_S (w[44], w[45], offset); + w[45] = amd_bytealign_S (w[43], w[44], offset); + w[44] = amd_bytealign_S (w[42], w[43], offset); + w[43] = amd_bytealign_S (w[41], w[42], 
offset); + w[42] = amd_bytealign_S (w[40], w[41], offset); + w[41] = amd_bytealign_S (w[39], w[40], offset); + w[40] = amd_bytealign_S (w[38], w[39], offset); + w[39] = amd_bytealign_S (w[37], w[38], offset); + w[38] = amd_bytealign_S (w[36], w[37], offset); + w[37] = amd_bytealign_S (w[35], w[36], offset); + w[36] = amd_bytealign_S (w[34], w[35], offset); + w[35] = amd_bytealign_S (w[33], w[34], offset); + w[34] = amd_bytealign_S (w[32], w[33], offset); + w[33] = amd_bytealign_S (w[31], w[32], offset); + w[32] = amd_bytealign_S (w[30], w[31], offset); + w[31] = amd_bytealign_S (w[29], w[30], offset); + w[30] = amd_bytealign_S (w[28], w[29], offset); + w[29] = amd_bytealign_S (w[27], w[28], offset); + w[28] = amd_bytealign_S (w[26], w[27], offset); + w[27] = amd_bytealign_S (w[25], w[26], offset); + w[26] = amd_bytealign_S (w[24], w[25], offset); + w[25] = amd_bytealign_S (w[23], w[24], offset); + w[24] = amd_bytealign_S (w[22], w[23], offset); + w[23] = amd_bytealign_S (w[21], w[22], offset); + w[22] = amd_bytealign_S (w[20], w[21], offset); + w[21] = amd_bytealign_S (w[19], w[20], offset); + w[20] = amd_bytealign_S (w[18], w[19], offset); + w[19] = amd_bytealign_S (w[17], w[18], offset); + w[18] = amd_bytealign_S (w[16], w[17], offset); + w[17] = amd_bytealign_S (w[15], w[16], offset); + w[16] = amd_bytealign_S (w[14], w[15], offset); + w[15] = amd_bytealign_S (w[13], w[14], offset); + w[14] = amd_bytealign_S (w[12], w[13], offset); + w[13] = amd_bytealign_S (w[11], w[12], offset); + w[12] = amd_bytealign_S (w[10], w[11], offset); + w[11] = amd_bytealign_S (w[ 9], w[10], offset); + w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign_S (w[ 1], w[ 2], 
offset); + w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 1] = w[ 2]; - w[ 2] = w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 2: - w[63] = amd_bytealign_S (w[61], w[60], offset_minus_4); - w[62] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[61] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[60] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[59] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[58] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[57] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[56] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[55] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[54] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[53] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[52] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[51] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[50] = 
amd_bytealign_S (w[48], w[47], offset_minus_4); - w[49] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[48] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[47] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[46] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[45] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[44] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[43] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[42] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[41] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[40] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[39] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[38] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[37] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[36] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[35] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[34] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[33] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[32] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[31] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[30] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[29] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[28] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[27] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[26] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[25] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[24] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[23] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[22] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[21] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[20] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[19] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[18] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[17] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[16] = amd_bytealign_S (w[14], 
w[13], offset_minus_4); - w[15] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[14] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[13] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[12] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[11] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[10] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 2] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[60], w[61], offset); + w[62] = amd_bytealign_S (w[59], w[60], offset); + w[61] = amd_bytealign_S (w[58], w[59], offset); + w[60] = amd_bytealign_S (w[57], w[58], offset); + w[59] = amd_bytealign_S (w[56], w[57], offset); + w[58] = amd_bytealign_S (w[55], w[56], offset); + w[57] = amd_bytealign_S (w[54], w[55], offset); + w[56] = amd_bytealign_S (w[53], w[54], offset); + w[55] = amd_bytealign_S (w[52], w[53], offset); + w[54] = amd_bytealign_S (w[51], w[52], offset); + w[53] = amd_bytealign_S (w[50], w[51], offset); + w[52] = amd_bytealign_S (w[49], w[50], offset); + w[51] = amd_bytealign_S (w[48], w[49], offset); + w[50] = amd_bytealign_S (w[47], w[48], offset); + w[49] = amd_bytealign_S (w[46], w[47], offset); + w[48] = amd_bytealign_S (w[45], w[46], offset); + w[47] = amd_bytealign_S (w[44], w[45], offset); + w[46] = amd_bytealign_S (w[43], w[44], offset); + w[45] = amd_bytealign_S (w[42], w[43], offset); + w[44] = amd_bytealign_S (w[41], w[42], offset); + w[43] = amd_bytealign_S (w[40], w[41], offset); + w[42] = amd_bytealign_S (w[39], w[40], offset); + w[41] = amd_bytealign_S (w[38], w[39], offset); + w[40] = 
amd_bytealign_S (w[37], w[38], offset); + w[39] = amd_bytealign_S (w[36], w[37], offset); + w[38] = amd_bytealign_S (w[35], w[36], offset); + w[37] = amd_bytealign_S (w[34], w[35], offset); + w[36] = amd_bytealign_S (w[33], w[34], offset); + w[35] = amd_bytealign_S (w[32], w[33], offset); + w[34] = amd_bytealign_S (w[31], w[32], offset); + w[33] = amd_bytealign_S (w[30], w[31], offset); + w[32] = amd_bytealign_S (w[29], w[30], offset); + w[31] = amd_bytealign_S (w[28], w[29], offset); + w[30] = amd_bytealign_S (w[27], w[28], offset); + w[29] = amd_bytealign_S (w[26], w[27], offset); + w[28] = amd_bytealign_S (w[25], w[26], offset); + w[27] = amd_bytealign_S (w[24], w[25], offset); + w[26] = amd_bytealign_S (w[23], w[24], offset); + w[25] = amd_bytealign_S (w[22], w[23], offset); + w[24] = amd_bytealign_S (w[21], w[22], offset); + w[23] = amd_bytealign_S (w[20], w[21], offset); + w[22] = amd_bytealign_S (w[19], w[20], offset); + w[21] = amd_bytealign_S (w[18], w[19], offset); + w[20] = amd_bytealign_S (w[17], w[18], offset); + w[19] = amd_bytealign_S (w[16], w[17], offset); + w[18] = amd_bytealign_S (w[15], w[16], offset); + w[17] = amd_bytealign_S (w[14], w[15], offset); + w[16] = amd_bytealign_S (w[13], w[14], offset); + w[15] = amd_bytealign_S (w[12], w[13], offset); + w[14] = amd_bytealign_S (w[11], w[12], offset); + w[13] = amd_bytealign_S (w[10], w[11], offset); + w[12] = amd_bytealign_S (w[ 9], w[10], offset); + w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 2] = 
w[ 3]; - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 3: - w[63] = amd_bytealign_S (w[60], w[59], offset_minus_4); - w[62] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[61] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[60] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[59] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[58] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[57] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[56] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[55] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[54] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[53] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[52] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[51] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[50] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[49] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[48] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[47] = 
amd_bytealign_S (w[44], w[43], offset_minus_4); - w[46] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[45] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[44] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[43] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[42] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[41] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[40] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[39] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[38] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[37] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[36] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[35] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[34] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[33] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[32] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[31] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[30] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[29] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[28] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[27] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[26] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[25] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[24] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[23] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[22] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[21] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[20] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[19] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[18] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[17] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[16] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[15] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[14] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[13] = amd_bytealign_S (w[10], w[ 
9], offset_minus_4); - w[12] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[11] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[10] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 3] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[59], w[60], offset); + w[62] = amd_bytealign_S (w[58], w[59], offset); + w[61] = amd_bytealign_S (w[57], w[58], offset); + w[60] = amd_bytealign_S (w[56], w[57], offset); + w[59] = amd_bytealign_S (w[55], w[56], offset); + w[58] = amd_bytealign_S (w[54], w[55], offset); + w[57] = amd_bytealign_S (w[53], w[54], offset); + w[56] = amd_bytealign_S (w[52], w[53], offset); + w[55] = amd_bytealign_S (w[51], w[52], offset); + w[54] = amd_bytealign_S (w[50], w[51], offset); + w[53] = amd_bytealign_S (w[49], w[50], offset); + w[52] = amd_bytealign_S (w[48], w[49], offset); + w[51] = amd_bytealign_S (w[47], w[48], offset); + w[50] = amd_bytealign_S (w[46], w[47], offset); + w[49] = amd_bytealign_S (w[45], w[46], offset); + w[48] = amd_bytealign_S (w[44], w[45], offset); + w[47] = amd_bytealign_S (w[43], w[44], offset); + w[46] = amd_bytealign_S (w[42], w[43], offset); + w[45] = amd_bytealign_S (w[41], w[42], offset); + w[44] = amd_bytealign_S (w[40], w[41], offset); + w[43] = amd_bytealign_S (w[39], w[40], offset); + w[42] = amd_bytealign_S (w[38], w[39], offset); + w[41] = amd_bytealign_S (w[37], w[38], offset); + w[40] = amd_bytealign_S (w[36], w[37], offset); + w[39] = amd_bytealign_S (w[35], w[36], offset); + w[38] = amd_bytealign_S (w[34], w[35], offset); + w[37] = amd_bytealign_S (w[33], w[34], offset); + w[36] = amd_bytealign_S (w[32], w[33], offset); + 
w[35] = amd_bytealign_S (w[31], w[32], offset); + w[34] = amd_bytealign_S (w[30], w[31], offset); + w[33] = amd_bytealign_S (w[29], w[30], offset); + w[32] = amd_bytealign_S (w[28], w[29], offset); + w[31] = amd_bytealign_S (w[27], w[28], offset); + w[30] = amd_bytealign_S (w[26], w[27], offset); + w[29] = amd_bytealign_S (w[25], w[26], offset); + w[28] = amd_bytealign_S (w[24], w[25], offset); + w[27] = amd_bytealign_S (w[23], w[24], offset); + w[26] = amd_bytealign_S (w[22], w[23], offset); + w[25] = amd_bytealign_S (w[21], w[22], offset); + w[24] = amd_bytealign_S (w[20], w[21], offset); + w[23] = amd_bytealign_S (w[19], w[20], offset); + w[22] = amd_bytealign_S (w[18], w[19], offset); + w[21] = amd_bytealign_S (w[17], w[18], offset); + w[20] = amd_bytealign_S (w[16], w[17], offset); + w[19] = amd_bytealign_S (w[15], w[16], offset); + w[18] = amd_bytealign_S (w[14], w[15], offset); + w[17] = amd_bytealign_S (w[13], w[14], offset); + w[16] = amd_bytealign_S (w[12], w[13], offset); + w[15] = amd_bytealign_S (w[11], w[12], offset); + w[14] = amd_bytealign_S (w[10], w[11], offset); + w[13] = amd_bytealign_S (w[ 9], w[10], offset); + w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 3] = w[ 4]; - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - 
w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 4: - w[63] = amd_bytealign_S (w[59], w[58], offset_minus_4); - w[62] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[61] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[60] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[59] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[58] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[57] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[56] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[55] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[54] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[53] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[52] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[51] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[50] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[49] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[48] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[47] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[46] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[45] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[44] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[43] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[42] = 
amd_bytealign_S (w[38], w[37], offset_minus_4); - w[41] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[40] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[39] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[38] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[37] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[36] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[35] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[34] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[33] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[32] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[31] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[30] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[29] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[28] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[27] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[26] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[25] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[24] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[23] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[22] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[21] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[20] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[19] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[18] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[17] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[16] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[15] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[14] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[13] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[12] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[11] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[10] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 4], w[ 
3], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 4] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[58], w[59], offset); + w[62] = amd_bytealign_S (w[57], w[58], offset); + w[61] = amd_bytealign_S (w[56], w[57], offset); + w[60] = amd_bytealign_S (w[55], w[56], offset); + w[59] = amd_bytealign_S (w[54], w[55], offset); + w[58] = amd_bytealign_S (w[53], w[54], offset); + w[57] = amd_bytealign_S (w[52], w[53], offset); + w[56] = amd_bytealign_S (w[51], w[52], offset); + w[55] = amd_bytealign_S (w[50], w[51], offset); + w[54] = amd_bytealign_S (w[49], w[50], offset); + w[53] = amd_bytealign_S (w[48], w[49], offset); + w[52] = amd_bytealign_S (w[47], w[48], offset); + w[51] = amd_bytealign_S (w[46], w[47], offset); + w[50] = amd_bytealign_S (w[45], w[46], offset); + w[49] = amd_bytealign_S (w[44], w[45], offset); + w[48] = amd_bytealign_S (w[43], w[44], offset); + w[47] = amd_bytealign_S (w[42], w[43], offset); + w[46] = amd_bytealign_S (w[41], w[42], offset); + w[45] = amd_bytealign_S (w[40], w[41], offset); + w[44] = amd_bytealign_S (w[39], w[40], offset); + w[43] = amd_bytealign_S (w[38], w[39], offset); + w[42] = amd_bytealign_S (w[37], w[38], offset); + w[41] = amd_bytealign_S (w[36], w[37], offset); + w[40] = amd_bytealign_S (w[35], w[36], offset); + w[39] = amd_bytealign_S (w[34], w[35], offset); + w[38] = amd_bytealign_S (w[33], w[34], offset); + w[37] = amd_bytealign_S (w[32], w[33], offset); + w[36] = amd_bytealign_S (w[31], w[32], offset); + w[35] = amd_bytealign_S (w[30], w[31], offset); + w[34] = amd_bytealign_S (w[29], w[30], offset); + w[33] = amd_bytealign_S (w[28], w[29], offset); + w[32] = amd_bytealign_S (w[27], w[28], offset); + w[31] = amd_bytealign_S (w[26], w[27], offset); + w[30] = amd_bytealign_S (w[25], w[26], offset); + w[29] = amd_bytealign_S (w[24], w[25], offset); 
+ w[28] = amd_bytealign_S (w[23], w[24], offset); + w[27] = amd_bytealign_S (w[22], w[23], offset); + w[26] = amd_bytealign_S (w[21], w[22], offset); + w[25] = amd_bytealign_S (w[20], w[21], offset); + w[24] = amd_bytealign_S (w[19], w[20], offset); + w[23] = amd_bytealign_S (w[18], w[19], offset); + w[22] = amd_bytealign_S (w[17], w[18], offset); + w[21] = amd_bytealign_S (w[16], w[17], offset); + w[20] = amd_bytealign_S (w[15], w[16], offset); + w[19] = amd_bytealign_S (w[14], w[15], offset); + w[18] = amd_bytealign_S (w[13], w[14], offset); + w[17] = amd_bytealign_S (w[12], w[13], offset); + w[16] = amd_bytealign_S (w[11], w[12], offset); + w[15] = amd_bytealign_S (w[10], w[11], offset); + w[14] = amd_bytealign_S (w[ 9], w[10], offset); + w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 4] = w[ 5]; - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; 
- w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 5: - w[63] = amd_bytealign_S (w[58], w[57], offset_minus_4); - w[62] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[61] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[60] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[59] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[58] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[57] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[56] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[55] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[54] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[53] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[52] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[51] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[50] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[49] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[48] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[47] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[46] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[45] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[44] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[43] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[42] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[41] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[40] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[39] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[38] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[37] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[36] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[35] = 
amd_bytealign_S (w[30], w[29], offset_minus_4); - w[34] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[33] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[32] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[31] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[30] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[29] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[28] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[27] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[26] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[25] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[24] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[23] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[22] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[21] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[20] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[19] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[18] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[17] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[16] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[15] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[14] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[13] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[12] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[11] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[10] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 5] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[57], w[58], offset); + w[62] = amd_bytealign_S (w[56], w[57], offset); + w[61] = amd_bytealign_S (w[55], w[56], offset); + w[60] = amd_bytealign_S (w[54], w[55], offset); + w[59] = 
amd_bytealign_S (w[53], w[54], offset); + w[58] = amd_bytealign_S (w[52], w[53], offset); + w[57] = amd_bytealign_S (w[51], w[52], offset); + w[56] = amd_bytealign_S (w[50], w[51], offset); + w[55] = amd_bytealign_S (w[49], w[50], offset); + w[54] = amd_bytealign_S (w[48], w[49], offset); + w[53] = amd_bytealign_S (w[47], w[48], offset); + w[52] = amd_bytealign_S (w[46], w[47], offset); + w[51] = amd_bytealign_S (w[45], w[46], offset); + w[50] = amd_bytealign_S (w[44], w[45], offset); + w[49] = amd_bytealign_S (w[43], w[44], offset); + w[48] = amd_bytealign_S (w[42], w[43], offset); + w[47] = amd_bytealign_S (w[41], w[42], offset); + w[46] = amd_bytealign_S (w[40], w[41], offset); + w[45] = amd_bytealign_S (w[39], w[40], offset); + w[44] = amd_bytealign_S (w[38], w[39], offset); + w[43] = amd_bytealign_S (w[37], w[38], offset); + w[42] = amd_bytealign_S (w[36], w[37], offset); + w[41] = amd_bytealign_S (w[35], w[36], offset); + w[40] = amd_bytealign_S (w[34], w[35], offset); + w[39] = amd_bytealign_S (w[33], w[34], offset); + w[38] = amd_bytealign_S (w[32], w[33], offset); + w[37] = amd_bytealign_S (w[31], w[32], offset); + w[36] = amd_bytealign_S (w[30], w[31], offset); + w[35] = amd_bytealign_S (w[29], w[30], offset); + w[34] = amd_bytealign_S (w[28], w[29], offset); + w[33] = amd_bytealign_S (w[27], w[28], offset); + w[32] = amd_bytealign_S (w[26], w[27], offset); + w[31] = amd_bytealign_S (w[25], w[26], offset); + w[30] = amd_bytealign_S (w[24], w[25], offset); + w[29] = amd_bytealign_S (w[23], w[24], offset); + w[28] = amd_bytealign_S (w[22], w[23], offset); + w[27] = amd_bytealign_S (w[21], w[22], offset); + w[26] = amd_bytealign_S (w[20], w[21], offset); + w[25] = amd_bytealign_S (w[19], w[20], offset); + w[24] = amd_bytealign_S (w[18], w[19], offset); + w[23] = amd_bytealign_S (w[17], w[18], offset); + w[22] = amd_bytealign_S (w[16], w[17], offset); + w[21] = amd_bytealign_S (w[15], w[16], offset); + w[20] = amd_bytealign_S (w[14], w[15], offset); + w[19] = 
amd_bytealign_S (w[13], w[14], offset); + w[18] = amd_bytealign_S (w[12], w[13], offset); + w[17] = amd_bytealign_S (w[11], w[12], offset); + w[16] = amd_bytealign_S (w[10], w[11], offset); + w[15] = amd_bytealign_S (w[ 9], w[10], offset); + w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 5] = w[ 6]; - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 6: - w[63] = amd_bytealign_S (w[57], w[56], offset_minus_4); - w[62] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[61] = amd_bytealign_S (w[55], w[54], 
offset_minus_4); - w[60] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[59] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[58] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[57] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[56] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[55] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[54] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[53] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[52] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[51] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[50] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[49] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[48] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[47] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[46] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[45] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[44] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[43] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[42] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[41] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[40] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[39] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[38] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[37] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[36] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[35] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[34] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[33] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[32] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[31] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[30] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[29] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[28] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[27] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[26] = 
amd_bytealign_S (w[20], w[19], offset_minus_4); - w[25] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[24] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[23] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[22] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[21] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[20] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[19] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[18] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[17] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[16] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[15] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[14] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[13] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[12] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[11] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[10] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 6] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[56], w[57], offset); + w[62] = amd_bytealign_S (w[55], w[56], offset); + w[61] = amd_bytealign_S (w[54], w[55], offset); + w[60] = amd_bytealign_S (w[53], w[54], offset); + w[59] = amd_bytealign_S (w[52], w[53], offset); + w[58] = amd_bytealign_S (w[51], w[52], offset); + w[57] = amd_bytealign_S (w[50], w[51], offset); + w[56] = amd_bytealign_S (w[49], w[50], offset); + w[55] = amd_bytealign_S (w[48], w[49], offset); + w[54] = amd_bytealign_S (w[47], w[48], offset); + w[53] = amd_bytealign_S (w[46], w[47], offset); + w[52] = amd_bytealign_S (w[45], w[46], offset); + w[51] = amd_bytealign_S (w[44], w[45], offset); + w[50] = amd_bytealign_S (w[43], w[44], offset); + w[49] = amd_bytealign_S (w[42], w[43], offset); + w[48] = amd_bytealign_S (w[41], w[42], 
offset); + w[47] = amd_bytealign_S (w[40], w[41], offset); + w[46] = amd_bytealign_S (w[39], w[40], offset); + w[45] = amd_bytealign_S (w[38], w[39], offset); + w[44] = amd_bytealign_S (w[37], w[38], offset); + w[43] = amd_bytealign_S (w[36], w[37], offset); + w[42] = amd_bytealign_S (w[35], w[36], offset); + w[41] = amd_bytealign_S (w[34], w[35], offset); + w[40] = amd_bytealign_S (w[33], w[34], offset); + w[39] = amd_bytealign_S (w[32], w[33], offset); + w[38] = amd_bytealign_S (w[31], w[32], offset); + w[37] = amd_bytealign_S (w[30], w[31], offset); + w[36] = amd_bytealign_S (w[29], w[30], offset); + w[35] = amd_bytealign_S (w[28], w[29], offset); + w[34] = amd_bytealign_S (w[27], w[28], offset); + w[33] = amd_bytealign_S (w[26], w[27], offset); + w[32] = amd_bytealign_S (w[25], w[26], offset); + w[31] = amd_bytealign_S (w[24], w[25], offset); + w[30] = amd_bytealign_S (w[23], w[24], offset); + w[29] = amd_bytealign_S (w[22], w[23], offset); + w[28] = amd_bytealign_S (w[21], w[22], offset); + w[27] = amd_bytealign_S (w[20], w[21], offset); + w[26] = amd_bytealign_S (w[19], w[20], offset); + w[25] = amd_bytealign_S (w[18], w[19], offset); + w[24] = amd_bytealign_S (w[17], w[18], offset); + w[23] = amd_bytealign_S (w[16], w[17], offset); + w[22] = amd_bytealign_S (w[15], w[16], offset); + w[21] = amd_bytealign_S (w[14], w[15], offset); + w[20] = amd_bytealign_S (w[13], w[14], offset); + w[19] = amd_bytealign_S (w[12], w[13], offset); + w[18] = amd_bytealign_S (w[11], w[12], offset); + w[17] = amd_bytealign_S (w[10], w[11], offset); + w[16] = amd_bytealign_S (w[ 9], w[10], offset); + w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign_S (w[ 1], w[ 2], 
offset); + w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; @@ -26462,128 +44591,66 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 6] = w[ 7]; - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 7: - w[63] = amd_bytealign_S (w[56], w[55], offset_minus_4); - w[62] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[61] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[60] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[59] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[58] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[57] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[56] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[55] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[54] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[53] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[52] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[51] = 
amd_bytealign_S (w[44], w[43], offset_minus_4); - w[50] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[49] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[48] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[47] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[46] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[45] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[44] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[43] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[42] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[41] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[40] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[39] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[38] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[37] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[36] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[35] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[34] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[33] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[32] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[31] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[30] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[29] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[28] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[27] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[26] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[25] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[24] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[23] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[22] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[21] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[20] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[19] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[18] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[17] = amd_bytealign_S (w[10], w[ 
9], offset_minus_4); - w[16] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[15] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[14] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[13] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[12] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[11] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[10] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 7] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[55], w[56], offset); + w[62] = amd_bytealign_S (w[54], w[55], offset); + w[61] = amd_bytealign_S (w[53], w[54], offset); + w[60] = amd_bytealign_S (w[52], w[53], offset); + w[59] = amd_bytealign_S (w[51], w[52], offset); + w[58] = amd_bytealign_S (w[50], w[51], offset); + w[57] = amd_bytealign_S (w[49], w[50], offset); + w[56] = amd_bytealign_S (w[48], w[49], offset); + w[55] = amd_bytealign_S (w[47], w[48], offset); + w[54] = amd_bytealign_S (w[46], w[47], offset); + w[53] = amd_bytealign_S (w[45], w[46], offset); + w[52] = amd_bytealign_S (w[44], w[45], offset); + w[51] = amd_bytealign_S (w[43], w[44], offset); + w[50] = amd_bytealign_S (w[42], w[43], offset); + w[49] = amd_bytealign_S (w[41], w[42], offset); + w[48] = amd_bytealign_S (w[40], w[41], offset); + w[47] = amd_bytealign_S (w[39], w[40], offset); + w[46] = amd_bytealign_S (w[38], w[39], offset); + w[45] = amd_bytealign_S (w[37], w[38], offset); + w[44] = amd_bytealign_S (w[36], w[37], offset); + w[43] = amd_bytealign_S (w[35], w[36], offset); + w[42] = amd_bytealign_S (w[34], w[35], offset); + w[41] = amd_bytealign_S (w[33], w[34], offset); + w[40] = amd_bytealign_S (w[32], w[33], offset); + w[39] = amd_bytealign_S (w[31], w[32], offset); + w[38] = amd_bytealign_S (w[30], w[31], offset); + w[37] = amd_bytealign_S (w[29], w[30], offset); + w[36] = amd_bytealign_S (w[28], w[29], offset); + 
w[35] = amd_bytealign_S (w[27], w[28], offset); + w[34] = amd_bytealign_S (w[26], w[27], offset); + w[33] = amd_bytealign_S (w[25], w[26], offset); + w[32] = amd_bytealign_S (w[24], w[25], offset); + w[31] = amd_bytealign_S (w[23], w[24], offset); + w[30] = amd_bytealign_S (w[22], w[23], offset); + w[29] = amd_bytealign_S (w[21], w[22], offset); + w[28] = amd_bytealign_S (w[20], w[21], offset); + w[27] = amd_bytealign_S (w[19], w[20], offset); + w[26] = amd_bytealign_S (w[18], w[19], offset); + w[25] = amd_bytealign_S (w[17], w[18], offset); + w[24] = amd_bytealign_S (w[16], w[17], offset); + w[23] = amd_bytealign_S (w[15], w[16], offset); + w[22] = amd_bytealign_S (w[14], w[15], offset); + w[21] = amd_bytealign_S (w[13], w[14], offset); + w[20] = amd_bytealign_S (w[12], w[13], offset); + w[19] = amd_bytealign_S (w[11], w[12], offset); + w[18] = amd_bytealign_S (w[10], w[11], offset); + w[17] = amd_bytealign_S (w[ 9], w[10], offset); + w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; @@ -26592,126 +44659,65 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 7] = w[ 8]; - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; 
- w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 8: - w[63] = amd_bytealign_S (w[55], w[54], offset_minus_4); - w[62] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[61] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[60] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[59] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[58] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[57] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[56] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[55] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[54] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[53] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[52] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[51] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[50] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[49] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[48] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[47] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[46] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[45] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[44] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[43] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[42] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[41] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[40] = amd_bytealign_S (w[32], w[31], 
offset_minus_4); - w[39] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[38] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[37] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[36] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[35] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[34] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[33] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[32] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[31] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[30] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[29] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[28] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[27] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[26] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[25] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[24] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[23] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[22] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[21] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[20] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[19] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[18] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[17] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[16] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[15] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[14] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[13] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[12] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[11] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[10] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 8] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[54], w[55], offset); + w[62] = amd_bytealign_S (w[53], w[54], offset); + w[61] = amd_bytealign_S 
(w[52], w[53], offset); + w[60] = amd_bytealign_S (w[51], w[52], offset); + w[59] = amd_bytealign_S (w[50], w[51], offset); + w[58] = amd_bytealign_S (w[49], w[50], offset); + w[57] = amd_bytealign_S (w[48], w[49], offset); + w[56] = amd_bytealign_S (w[47], w[48], offset); + w[55] = amd_bytealign_S (w[46], w[47], offset); + w[54] = amd_bytealign_S (w[45], w[46], offset); + w[53] = amd_bytealign_S (w[44], w[45], offset); + w[52] = amd_bytealign_S (w[43], w[44], offset); + w[51] = amd_bytealign_S (w[42], w[43], offset); + w[50] = amd_bytealign_S (w[41], w[42], offset); + w[49] = amd_bytealign_S (w[40], w[41], offset); + w[48] = amd_bytealign_S (w[39], w[40], offset); + w[47] = amd_bytealign_S (w[38], w[39], offset); + w[46] = amd_bytealign_S (w[37], w[38], offset); + w[45] = amd_bytealign_S (w[36], w[37], offset); + w[44] = amd_bytealign_S (w[35], w[36], offset); + w[43] = amd_bytealign_S (w[34], w[35], offset); + w[42] = amd_bytealign_S (w[33], w[34], offset); + w[41] = amd_bytealign_S (w[32], w[33], offset); + w[40] = amd_bytealign_S (w[31], w[32], offset); + w[39] = amd_bytealign_S (w[30], w[31], offset); + w[38] = amd_bytealign_S (w[29], w[30], offset); + w[37] = amd_bytealign_S (w[28], w[29], offset); + w[36] = amd_bytealign_S (w[27], w[28], offset); + w[35] = amd_bytealign_S (w[26], w[27], offset); + w[34] = amd_bytealign_S (w[25], w[26], offset); + w[33] = amd_bytealign_S (w[24], w[25], offset); + w[32] = amd_bytealign_S (w[23], w[24], offset); + w[31] = amd_bytealign_S (w[22], w[23], offset); + w[30] = amd_bytealign_S (w[21], w[22], offset); + w[29] = amd_bytealign_S (w[20], w[21], offset); + w[28] = amd_bytealign_S (w[19], w[20], offset); + w[27] = amd_bytealign_S (w[18], w[19], offset); + w[26] = amd_bytealign_S (w[17], w[18], offset); + w[25] = amd_bytealign_S (w[16], w[17], offset); + w[24] = amd_bytealign_S (w[15], w[16], offset); + w[23] = amd_bytealign_S (w[14], w[15], offset); + w[22] = amd_bytealign_S (w[13], w[14], offset); + w[21] = amd_bytealign_S 
(w[12], w[13], offset); + w[20] = amd_bytealign_S (w[11], w[12], offset); + w[19] = amd_bytealign_S (w[10], w[11], offset); + w[18] = amd_bytealign_S (w[ 9], w[10], offset); + w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; @@ -26721,124 +44727,64 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 8] = w[ 9]; - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 9: - w[63] = amd_bytealign_S (w[54], w[53], offset_minus_4); - w[62] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[61] = amd_bytealign_S (w[52], w[51], offset_minus_4); - 
w[60] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[59] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[58] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[57] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[56] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[55] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[54] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[53] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[52] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[51] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[50] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[49] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[48] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[47] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[46] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[45] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[44] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[43] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[42] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[41] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[40] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[39] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[38] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[37] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[36] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[35] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[34] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[33] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[32] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[31] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[30] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[29] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[28] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[27] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[26] = amd_bytealign_S 
(w[17], w[16], offset_minus_4); - w[25] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[24] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[23] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[22] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[21] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[20] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[19] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[18] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[17] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[16] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[15] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[14] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[13] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[12] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[11] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[10] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[ 9] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[53], w[54], offset); + w[62] = amd_bytealign_S (w[52], w[53], offset); + w[61] = amd_bytealign_S (w[51], w[52], offset); + w[60] = amd_bytealign_S (w[50], w[51], offset); + w[59] = amd_bytealign_S (w[49], w[50], offset); + w[58] = amd_bytealign_S (w[48], w[49], offset); + w[57] = amd_bytealign_S (w[47], w[48], offset); + w[56] = amd_bytealign_S (w[46], w[47], offset); + w[55] = amd_bytealign_S (w[45], w[46], offset); + w[54] = amd_bytealign_S (w[44], w[45], offset); + w[53] = amd_bytealign_S (w[43], w[44], offset); + w[52] = amd_bytealign_S (w[42], w[43], offset); + w[51] = amd_bytealign_S (w[41], w[42], offset); + w[50] = amd_bytealign_S (w[40], w[41], offset); + w[49] = amd_bytealign_S (w[39], w[40], offset); + w[48] = amd_bytealign_S (w[38], w[39], offset); + w[47] = amd_bytealign_S (w[37], w[38], offset); + w[46] = amd_bytealign_S (w[36], w[37], offset); + w[45] = amd_bytealign_S (w[35], w[36], offset); + w[44] = amd_bytealign_S (w[34], 
w[35], offset); + w[43] = amd_bytealign_S (w[33], w[34], offset); + w[42] = amd_bytealign_S (w[32], w[33], offset); + w[41] = amd_bytealign_S (w[31], w[32], offset); + w[40] = amd_bytealign_S (w[30], w[31], offset); + w[39] = amd_bytealign_S (w[29], w[30], offset); + w[38] = amd_bytealign_S (w[28], w[29], offset); + w[37] = amd_bytealign_S (w[27], w[28], offset); + w[36] = amd_bytealign_S (w[26], w[27], offset); + w[35] = amd_bytealign_S (w[25], w[26], offset); + w[34] = amd_bytealign_S (w[24], w[25], offset); + w[33] = amd_bytealign_S (w[23], w[24], offset); + w[32] = amd_bytealign_S (w[22], w[23], offset); + w[31] = amd_bytealign_S (w[21], w[22], offset); + w[30] = amd_bytealign_S (w[20], w[21], offset); + w[29] = amd_bytealign_S (w[19], w[20], offset); + w[28] = amd_bytealign_S (w[18], w[19], offset); + w[27] = amd_bytealign_S (w[17], w[18], offset); + w[26] = amd_bytealign_S (w[16], w[17], offset); + w[25] = amd_bytealign_S (w[15], w[16], offset); + w[24] = amd_bytealign_S (w[14], w[15], offset); + w[23] = amd_bytealign_S (w[13], w[14], offset); + w[22] = amd_bytealign_S (w[12], w[13], offset); + w[21] = amd_bytealign_S (w[11], w[12], offset); + w[20] = amd_bytealign_S (w[10], w[11], offset); + w[19] = amd_bytealign_S (w[ 9], w[10], offset); + w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; @@ -26849,122 +44795,63 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[ 9] = w[10]; - w[10] = w[11]; - w[11] = w[12]; - 
w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 10: - w[63] = amd_bytealign_S (w[53], w[52], offset_minus_4); - w[62] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[61] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[60] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[59] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[58] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[57] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[56] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[55] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[54] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[53] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[52] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[51] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[50] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[49] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[48] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[47] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[46] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[45] = amd_bytealign_S (w[35], w[34], 
offset_minus_4); - w[44] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[43] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[42] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[41] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[40] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[39] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[38] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[37] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[36] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[35] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[34] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[33] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[32] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[31] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[30] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[29] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[28] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[27] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[26] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[25] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[24] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[23] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[22] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[21] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[20] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[19] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[18] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[17] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[16] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[15] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[14] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[13] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[12] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[11] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[10] = 
amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[52], w[53], offset); + w[62] = amd_bytealign_S (w[51], w[52], offset); + w[61] = amd_bytealign_S (w[50], w[51], offset); + w[60] = amd_bytealign_S (w[49], w[50], offset); + w[59] = amd_bytealign_S (w[48], w[49], offset); + w[58] = amd_bytealign_S (w[47], w[48], offset); + w[57] = amd_bytealign_S (w[46], w[47], offset); + w[56] = amd_bytealign_S (w[45], w[46], offset); + w[55] = amd_bytealign_S (w[44], w[45], offset); + w[54] = amd_bytealign_S (w[43], w[44], offset); + w[53] = amd_bytealign_S (w[42], w[43], offset); + w[52] = amd_bytealign_S (w[41], w[42], offset); + w[51] = amd_bytealign_S (w[40], w[41], offset); + w[50] = amd_bytealign_S (w[39], w[40], offset); + w[49] = amd_bytealign_S (w[38], w[39], offset); + w[48] = amd_bytealign_S (w[37], w[38], offset); + w[47] = amd_bytealign_S (w[36], w[37], offset); + w[46] = amd_bytealign_S (w[35], w[36], offset); + w[45] = amd_bytealign_S (w[34], w[35], offset); + w[44] = amd_bytealign_S (w[33], w[34], offset); + w[43] = amd_bytealign_S (w[32], w[33], offset); + w[42] = amd_bytealign_S (w[31], w[32], offset); + w[41] = amd_bytealign_S (w[30], w[31], offset); + w[40] = amd_bytealign_S (w[29], w[30], offset); + w[39] = amd_bytealign_S (w[28], w[29], offset); + w[38] = amd_bytealign_S (w[27], w[28], offset); + w[37] = amd_bytealign_S (w[26], w[27], offset); + w[36] = amd_bytealign_S (w[25], w[26], offset); + w[35] = amd_bytealign_S (w[24], w[25], offset); + w[34] = amd_bytealign_S (w[23], w[24], offset); + w[33] = amd_bytealign_S (w[22], w[23], offset); + w[32] = amd_bytealign_S (w[21], w[22], offset); + w[31] = amd_bytealign_S (w[20], w[21], offset); + w[30] = amd_bytealign_S (w[19], w[20], offset); + w[29] = amd_bytealign_S (w[18], w[19], offset); + w[28] = amd_bytealign_S (w[17], w[18], offset); + w[27] = amd_bytealign_S (w[16], w[17], offset); + w[26] = amd_bytealign_S (w[15], w[16], offset); + w[25] = amd_bytealign_S (w[14], w[15], offset); + 
w[24] = amd_bytealign_S (w[13], w[14], offset); + w[23] = amd_bytealign_S (w[12], w[13], offset); + w[22] = amd_bytealign_S (w[11], w[12], offset); + w[21] = amd_bytealign_S (w[10], w[11], offset); + w[20] = amd_bytealign_S (w[ 9], w[10], offset); + w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[10] = amd_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; @@ -26976,120 +44863,62 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[10] = w[11]; - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 11: - w[63] = amd_bytealign_S (w[52], w[51], offset_minus_4); - w[62] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[61] = 
amd_bytealign_S (w[50], w[49], offset_minus_4); - w[60] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[59] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[58] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[57] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[56] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[55] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[54] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[53] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[52] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[51] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[50] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[49] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[48] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[47] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[46] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[45] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[44] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[43] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[42] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[41] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[40] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[39] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[38] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[37] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[36] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[35] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[34] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[33] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[32] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[31] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[30] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[29] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[28] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[27] = amd_bytealign_S (w[16], 
w[15], offset_minus_4); - w[26] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[25] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[24] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[23] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[22] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[21] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[20] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[19] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[18] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[17] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[16] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[15] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[14] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[13] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[12] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[11] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[51], w[52], offset); + w[62] = amd_bytealign_S (w[50], w[51], offset); + w[61] = amd_bytealign_S (w[49], w[50], offset); + w[60] = amd_bytealign_S (w[48], w[49], offset); + w[59] = amd_bytealign_S (w[47], w[48], offset); + w[58] = amd_bytealign_S (w[46], w[47], offset); + w[57] = amd_bytealign_S (w[45], w[46], offset); + w[56] = amd_bytealign_S (w[44], w[45], offset); + w[55] = amd_bytealign_S (w[43], w[44], offset); + w[54] = amd_bytealign_S (w[42], w[43], offset); + w[53] = amd_bytealign_S (w[41], w[42], offset); + w[52] = amd_bytealign_S (w[40], w[41], offset); + w[51] = amd_bytealign_S (w[39], w[40], offset); + w[50] = amd_bytealign_S (w[38], w[39], offset); + w[49] = amd_bytealign_S (w[37], w[38], offset); + w[48] = amd_bytealign_S (w[36], w[37], offset); + w[47] = amd_bytealign_S (w[35], w[36], offset); + w[46] = amd_bytealign_S (w[34], w[35], offset); + w[45] = amd_bytealign_S (w[33], w[34], offset); + w[44] = amd_bytealign_S (w[32], w[33], offset); + w[43] = amd_bytealign_S (w[31], w[32], offset); + 
w[42] = amd_bytealign_S (w[30], w[31], offset); + w[41] = amd_bytealign_S (w[29], w[30], offset); + w[40] = amd_bytealign_S (w[28], w[29], offset); + w[39] = amd_bytealign_S (w[27], w[28], offset); + w[38] = amd_bytealign_S (w[26], w[27], offset); + w[37] = amd_bytealign_S (w[25], w[26], offset); + w[36] = amd_bytealign_S (w[24], w[25], offset); + w[35] = amd_bytealign_S (w[23], w[24], offset); + w[34] = amd_bytealign_S (w[22], w[23], offset); + w[33] = amd_bytealign_S (w[21], w[22], offset); + w[32] = amd_bytealign_S (w[20], w[21], offset); + w[31] = amd_bytealign_S (w[19], w[20], offset); + w[30] = amd_bytealign_S (w[18], w[19], offset); + w[29] = amd_bytealign_S (w[17], w[18], offset); + w[28] = amd_bytealign_S (w[16], w[17], offset); + w[27] = amd_bytealign_S (w[15], w[16], offset); + w[26] = amd_bytealign_S (w[14], w[15], offset); + w[25] = amd_bytealign_S (w[13], w[14], offset); + w[24] = amd_bytealign_S (w[12], w[13], offset); + w[23] = amd_bytealign_S (w[11], w[12], offset); + w[22] = amd_bytealign_S (w[10], w[11], offset); + w[21] = amd_bytealign_S (w[ 9], w[10], offset); + w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[11] = amd_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; @@ -27102,118 +44931,61 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[11] = w[12]; - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - 
w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 12: - w[63] = amd_bytealign_S (w[51], w[50], offset_minus_4); - w[62] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[61] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[60] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[59] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[58] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[57] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[56] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[55] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[54] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[53] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[52] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[51] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[50] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[49] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[48] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[47] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[46] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[45] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[44] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[43] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[42] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[41] = amd_bytealign_S 
(w[29], w[28], offset_minus_4); - w[40] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[39] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[38] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[37] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[36] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[35] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[34] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[33] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[32] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[31] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[30] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[29] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[28] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[27] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[26] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[25] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[24] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[23] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[22] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[21] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[20] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[19] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[18] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[17] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[16] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[15] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[14] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[13] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[12] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[50], w[51], offset); + w[62] = amd_bytealign_S (w[49], w[50], offset); + w[61] = amd_bytealign_S (w[48], w[49], offset); + w[60] = amd_bytealign_S (w[47], w[48], offset); + w[59] = amd_bytealign_S (w[46], w[47], offset); + w[58] = amd_bytealign_S (w[45], 
w[46], offset); + w[57] = amd_bytealign_S (w[44], w[45], offset); + w[56] = amd_bytealign_S (w[43], w[44], offset); + w[55] = amd_bytealign_S (w[42], w[43], offset); + w[54] = amd_bytealign_S (w[41], w[42], offset); + w[53] = amd_bytealign_S (w[40], w[41], offset); + w[52] = amd_bytealign_S (w[39], w[40], offset); + w[51] = amd_bytealign_S (w[38], w[39], offset); + w[50] = amd_bytealign_S (w[37], w[38], offset); + w[49] = amd_bytealign_S (w[36], w[37], offset); + w[48] = amd_bytealign_S (w[35], w[36], offset); + w[47] = amd_bytealign_S (w[34], w[35], offset); + w[46] = amd_bytealign_S (w[33], w[34], offset); + w[45] = amd_bytealign_S (w[32], w[33], offset); + w[44] = amd_bytealign_S (w[31], w[32], offset); + w[43] = amd_bytealign_S (w[30], w[31], offset); + w[42] = amd_bytealign_S (w[29], w[30], offset); + w[41] = amd_bytealign_S (w[28], w[29], offset); + w[40] = amd_bytealign_S (w[27], w[28], offset); + w[39] = amd_bytealign_S (w[26], w[27], offset); + w[38] = amd_bytealign_S (w[25], w[26], offset); + w[37] = amd_bytealign_S (w[24], w[25], offset); + w[36] = amd_bytealign_S (w[23], w[24], offset); + w[35] = amd_bytealign_S (w[22], w[23], offset); + w[34] = amd_bytealign_S (w[21], w[22], offset); + w[33] = amd_bytealign_S (w[20], w[21], offset); + w[32] = amd_bytealign_S (w[19], w[20], offset); + w[31] = amd_bytealign_S (w[18], w[19], offset); + w[30] = amd_bytealign_S (w[17], w[18], offset); + w[29] = amd_bytealign_S (w[16], w[17], offset); + w[28] = amd_bytealign_S (w[15], w[16], offset); + w[27] = amd_bytealign_S (w[14], w[15], offset); + w[26] = amd_bytealign_S (w[13], w[14], offset); + w[25] = amd_bytealign_S (w[12], w[13], offset); + w[24] = amd_bytealign_S (w[11], w[12], offset); + w[23] = amd_bytealign_S (w[10], w[11], offset); + w[22] = amd_bytealign_S (w[ 9], w[10], offset); + w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[18] = amd_bytealign_S (w[ 5], 
w[ 6], offset); + w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[12] = amd_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; @@ -27227,116 +44999,60 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[12] = w[13]; - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 13: - w[63] = amd_bytealign_S (w[50], w[49], offset_minus_4); - w[62] = amd_bytealign_S (w[49], w[48], offset_minus_4); - w[61] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[60] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[59] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[58] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[57] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[56] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[55] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[54] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[53] = 
amd_bytealign_S (w[40], w[39], offset_minus_4); - w[52] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[51] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[50] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[49] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[48] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[47] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[46] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[45] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[44] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[43] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[42] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[41] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[40] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[39] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[38] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[37] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[36] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[35] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[34] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[33] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[32] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[31] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[30] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[29] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[28] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[27] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[26] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[25] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[24] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[23] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[22] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[21] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[20] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[19] = amd_bytealign_S (w[ 6], w[ 
5], offset_minus_4); - w[18] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[17] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[16] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[15] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[14] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[13] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[49], w[50], offset); + w[62] = amd_bytealign_S (w[48], w[49], offset); + w[61] = amd_bytealign_S (w[47], w[48], offset); + w[60] = amd_bytealign_S (w[46], w[47], offset); + w[59] = amd_bytealign_S (w[45], w[46], offset); + w[58] = amd_bytealign_S (w[44], w[45], offset); + w[57] = amd_bytealign_S (w[43], w[44], offset); + w[56] = amd_bytealign_S (w[42], w[43], offset); + w[55] = amd_bytealign_S (w[41], w[42], offset); + w[54] = amd_bytealign_S (w[40], w[41], offset); + w[53] = amd_bytealign_S (w[39], w[40], offset); + w[52] = amd_bytealign_S (w[38], w[39], offset); + w[51] = amd_bytealign_S (w[37], w[38], offset); + w[50] = amd_bytealign_S (w[36], w[37], offset); + w[49] = amd_bytealign_S (w[35], w[36], offset); + w[48] = amd_bytealign_S (w[34], w[35], offset); + w[47] = amd_bytealign_S (w[33], w[34], offset); + w[46] = amd_bytealign_S (w[32], w[33], offset); + w[45] = amd_bytealign_S (w[31], w[32], offset); + w[44] = amd_bytealign_S (w[30], w[31], offset); + w[43] = amd_bytealign_S (w[29], w[30], offset); + w[42] = amd_bytealign_S (w[28], w[29], offset); + w[41] = amd_bytealign_S (w[27], w[28], offset); + w[40] = amd_bytealign_S (w[26], w[27], offset); + w[39] = amd_bytealign_S (w[25], w[26], offset); + w[38] = amd_bytealign_S (w[24], w[25], offset); + w[37] = amd_bytealign_S (w[23], w[24], offset); + w[36] = amd_bytealign_S (w[22], w[23], offset); + w[35] = amd_bytealign_S (w[21], w[22], offset); + w[34] = amd_bytealign_S (w[20], w[21], offset); + w[33] = amd_bytealign_S (w[19], w[20], offset); + w[32] = amd_bytealign_S (w[18], w[19], offset); + w[31] = amd_bytealign_S (w[17], 
w[18], offset); + w[30] = amd_bytealign_S (w[16], w[17], offset); + w[29] = amd_bytealign_S (w[15], w[16], offset); + w[28] = amd_bytealign_S (w[14], w[15], offset); + w[27] = amd_bytealign_S (w[13], w[14], offset); + w[26] = amd_bytealign_S (w[12], w[13], offset); + w[25] = amd_bytealign_S (w[11], w[12], offset); + w[24] = amd_bytealign_S (w[10], w[11], offset); + w[23] = amd_bytealign_S (w[ 9], w[10], offset); + w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[13] = amd_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; @@ -27351,114 +45067,59 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[13] = w[14]; - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 14: - w[63] = 
amd_bytealign_S (w[49], w[48], offset_minus_4); - w[62] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[61] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[60] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[59] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[58] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[57] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[56] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[55] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[54] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[53] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[52] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[51] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[50] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[49] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[48] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[47] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[46] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[45] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[44] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[43] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[42] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[41] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[40] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[39] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[38] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[37] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[36] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[35] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[34] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[33] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[32] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[31] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[30] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[29] = amd_bytealign_S (w[15], 
w[14], offset_minus_4); - w[28] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[27] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[26] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[25] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[24] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[23] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[22] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[21] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[20] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[19] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[18] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[17] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[16] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[15] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[14] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[48], w[49], offset); + w[62] = amd_bytealign_S (w[47], w[48], offset); + w[61] = amd_bytealign_S (w[46], w[47], offset); + w[60] = amd_bytealign_S (w[45], w[46], offset); + w[59] = amd_bytealign_S (w[44], w[45], offset); + w[58] = amd_bytealign_S (w[43], w[44], offset); + w[57] = amd_bytealign_S (w[42], w[43], offset); + w[56] = amd_bytealign_S (w[41], w[42], offset); + w[55] = amd_bytealign_S (w[40], w[41], offset); + w[54] = amd_bytealign_S (w[39], w[40], offset); + w[53] = amd_bytealign_S (w[38], w[39], offset); + w[52] = amd_bytealign_S (w[37], w[38], offset); + w[51] = amd_bytealign_S (w[36], w[37], offset); + w[50] = amd_bytealign_S (w[35], w[36], offset); + w[49] = amd_bytealign_S (w[34], w[35], offset); + w[48] = amd_bytealign_S (w[33], w[34], offset); + w[47] = amd_bytealign_S (w[32], w[33], offset); + w[46] = amd_bytealign_S (w[31], w[32], offset); + w[45] = amd_bytealign_S (w[30], w[31], offset); + w[44] = amd_bytealign_S (w[29], w[30], offset); + w[43] = amd_bytealign_S (w[28], w[29], offset); + w[42] = amd_bytealign_S (w[27], w[28], offset); + w[41] = 
amd_bytealign_S (w[26], w[27], offset); + w[40] = amd_bytealign_S (w[25], w[26], offset); + w[39] = amd_bytealign_S (w[24], w[25], offset); + w[38] = amd_bytealign_S (w[23], w[24], offset); + w[37] = amd_bytealign_S (w[22], w[23], offset); + w[36] = amd_bytealign_S (w[21], w[22], offset); + w[35] = amd_bytealign_S (w[20], w[21], offset); + w[34] = amd_bytealign_S (w[19], w[20], offset); + w[33] = amd_bytealign_S (w[18], w[19], offset); + w[32] = amd_bytealign_S (w[17], w[18], offset); + w[31] = amd_bytealign_S (w[16], w[17], offset); + w[30] = amd_bytealign_S (w[15], w[16], offset); + w[29] = amd_bytealign_S (w[14], w[15], offset); + w[28] = amd_bytealign_S (w[13], w[14], offset); + w[27] = amd_bytealign_S (w[12], w[13], offset); + w[26] = amd_bytealign_S (w[11], w[12], offset); + w[25] = amd_bytealign_S (w[10], w[11], offset); + w[24] = amd_bytealign_S (w[ 9], w[10], offset); + w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[14] = amd_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; @@ -27474,112 +45135,58 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[14] = w[15]; - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] 
= w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 15: - w[63] = amd_bytealign_S (w[48], w[47], offset_minus_4); - w[62] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[61] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[60] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[59] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[58] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[57] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[56] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[55] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[54] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[53] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[52] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[51] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[50] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[49] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[48] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[47] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[46] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[45] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[44] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[43] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[42] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[41] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[40] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[39] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[38] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[37] = amd_bytealign_S (w[22], w[21], 
offset_minus_4); - w[36] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[35] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[34] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[33] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[32] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[31] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[30] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[29] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[28] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[27] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[26] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[25] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[24] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[23] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[22] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[21] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[20] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[19] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[18] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[17] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[16] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[15] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[47], w[48], offset); + w[62] = amd_bytealign_S (w[46], w[47], offset); + w[61] = amd_bytealign_S (w[45], w[46], offset); + w[60] = amd_bytealign_S (w[44], w[45], offset); + w[59] = amd_bytealign_S (w[43], w[44], offset); + w[58] = amd_bytealign_S (w[42], w[43], offset); + w[57] = amd_bytealign_S (w[41], w[42], offset); + w[56] = amd_bytealign_S (w[40], w[41], offset); + w[55] = amd_bytealign_S (w[39], w[40], offset); + w[54] = amd_bytealign_S (w[38], w[39], offset); + w[53] = amd_bytealign_S (w[37], w[38], offset); + w[52] = amd_bytealign_S (w[36], w[37], offset); + w[51] = amd_bytealign_S (w[35], w[36], offset); + w[50] = amd_bytealign_S (w[34], w[35], offset); + w[49] = 
amd_bytealign_S (w[33], w[34], offset); + w[48] = amd_bytealign_S (w[32], w[33], offset); + w[47] = amd_bytealign_S (w[31], w[32], offset); + w[46] = amd_bytealign_S (w[30], w[31], offset); + w[45] = amd_bytealign_S (w[29], w[30], offset); + w[44] = amd_bytealign_S (w[28], w[29], offset); + w[43] = amd_bytealign_S (w[27], w[28], offset); + w[42] = amd_bytealign_S (w[26], w[27], offset); + w[41] = amd_bytealign_S (w[25], w[26], offset); + w[40] = amd_bytealign_S (w[24], w[25], offset); + w[39] = amd_bytealign_S (w[23], w[24], offset); + w[38] = amd_bytealign_S (w[22], w[23], offset); + w[37] = amd_bytealign_S (w[21], w[22], offset); + w[36] = amd_bytealign_S (w[20], w[21], offset); + w[35] = amd_bytealign_S (w[19], w[20], offset); + w[34] = amd_bytealign_S (w[18], w[19], offset); + w[33] = amd_bytealign_S (w[17], w[18], offset); + w[32] = amd_bytealign_S (w[16], w[17], offset); + w[31] = amd_bytealign_S (w[15], w[16], offset); + w[30] = amd_bytealign_S (w[14], w[15], offset); + w[29] = amd_bytealign_S (w[13], w[14], offset); + w[28] = amd_bytealign_S (w[12], w[13], offset); + w[27] = amd_bytealign_S (w[11], w[12], offset); + w[26] = amd_bytealign_S (w[10], w[11], offset); + w[25] = amd_bytealign_S (w[ 9], w[10], offset); + w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[15] = amd_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; @@ -27596,110 +45203,57 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[15] = w[16]; - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = 
w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 16: - w[63] = amd_bytealign_S (w[47], w[46], offset_minus_4); - w[62] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[61] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[60] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[59] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[58] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[57] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[56] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[55] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[54] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[53] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[52] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[51] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[50] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[49] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[48] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[47] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[46] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[45] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[44] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[43] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[42] 
= amd_bytealign_S (w[26], w[25], offset_minus_4); - w[41] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[40] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[39] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[38] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[37] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[36] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[35] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[34] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[33] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[32] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[31] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[30] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[29] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[28] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[27] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[26] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[25] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[24] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[23] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[22] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[21] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[20] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[19] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[18] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[17] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[16] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[46], w[47], offset); + w[62] = amd_bytealign_S (w[45], w[46], offset); + w[61] = amd_bytealign_S (w[44], w[45], offset); + w[60] = amd_bytealign_S (w[43], w[44], offset); + w[59] = amd_bytealign_S (w[42], w[43], offset); + w[58] = amd_bytealign_S (w[41], w[42], offset); + w[57] = amd_bytealign_S (w[40], w[41], offset); + w[56] = amd_bytealign_S (w[39], w[40], offset); + w[55] = amd_bytealign_S (w[38], w[39], 
offset); + w[54] = amd_bytealign_S (w[37], w[38], offset); + w[53] = amd_bytealign_S (w[36], w[37], offset); + w[52] = amd_bytealign_S (w[35], w[36], offset); + w[51] = amd_bytealign_S (w[34], w[35], offset); + w[50] = amd_bytealign_S (w[33], w[34], offset); + w[49] = amd_bytealign_S (w[32], w[33], offset); + w[48] = amd_bytealign_S (w[31], w[32], offset); + w[47] = amd_bytealign_S (w[30], w[31], offset); + w[46] = amd_bytealign_S (w[29], w[30], offset); + w[45] = amd_bytealign_S (w[28], w[29], offset); + w[44] = amd_bytealign_S (w[27], w[28], offset); + w[43] = amd_bytealign_S (w[26], w[27], offset); + w[42] = amd_bytealign_S (w[25], w[26], offset); + w[41] = amd_bytealign_S (w[24], w[25], offset); + w[40] = amd_bytealign_S (w[23], w[24], offset); + w[39] = amd_bytealign_S (w[22], w[23], offset); + w[38] = amd_bytealign_S (w[21], w[22], offset); + w[37] = amd_bytealign_S (w[20], w[21], offset); + w[36] = amd_bytealign_S (w[19], w[20], offset); + w[35] = amd_bytealign_S (w[18], w[19], offset); + w[34] = amd_bytealign_S (w[17], w[18], offset); + w[33] = amd_bytealign_S (w[16], w[17], offset); + w[32] = amd_bytealign_S (w[15], w[16], offset); + w[31] = amd_bytealign_S (w[14], w[15], offset); + w[30] = amd_bytealign_S (w[13], w[14], offset); + w[29] = amd_bytealign_S (w[12], w[13], offset); + w[28] = amd_bytealign_S (w[11], w[12], offset); + w[27] = amd_bytealign_S (w[10], w[11], offset); + w[26] = amd_bytealign_S (w[ 9], w[10], offset); + w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[16] = amd_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; @@ 
-27717,108 +45271,56 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[16] = w[17]; - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 17: - w[63] = amd_bytealign_S (w[46], w[45], offset_minus_4); - w[62] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[61] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[60] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[59] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[58] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[57] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[56] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[55] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[54] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[53] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[52] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[51] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[50] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[49] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[48] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[47] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[46] = amd_bytealign_S (w[29], 
w[28], offset_minus_4); - w[45] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[44] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[43] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[42] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[41] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[40] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[39] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[38] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[37] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[36] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[35] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[34] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[33] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[32] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[31] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[30] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[29] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[28] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[27] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[26] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[25] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[24] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[23] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[22] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[21] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[20] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[19] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[18] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[17] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[45], w[46], offset); + w[62] = amd_bytealign_S (w[44], w[45], offset); + w[61] = amd_bytealign_S (w[43], w[44], offset); + w[60] = amd_bytealign_S (w[42], w[43], offset); + w[59] = amd_bytealign_S (w[41], w[42], offset); + w[58] = amd_bytealign_S (w[40], w[41], 
offset); + w[57] = amd_bytealign_S (w[39], w[40], offset); + w[56] = amd_bytealign_S (w[38], w[39], offset); + w[55] = amd_bytealign_S (w[37], w[38], offset); + w[54] = amd_bytealign_S (w[36], w[37], offset); + w[53] = amd_bytealign_S (w[35], w[36], offset); + w[52] = amd_bytealign_S (w[34], w[35], offset); + w[51] = amd_bytealign_S (w[33], w[34], offset); + w[50] = amd_bytealign_S (w[32], w[33], offset); + w[49] = amd_bytealign_S (w[31], w[32], offset); + w[48] = amd_bytealign_S (w[30], w[31], offset); + w[47] = amd_bytealign_S (w[29], w[30], offset); + w[46] = amd_bytealign_S (w[28], w[29], offset); + w[45] = amd_bytealign_S (w[27], w[28], offset); + w[44] = amd_bytealign_S (w[26], w[27], offset); + w[43] = amd_bytealign_S (w[25], w[26], offset); + w[42] = amd_bytealign_S (w[24], w[25], offset); + w[41] = amd_bytealign_S (w[23], w[24], offset); + w[40] = amd_bytealign_S (w[22], w[23], offset); + w[39] = amd_bytealign_S (w[21], w[22], offset); + w[38] = amd_bytealign_S (w[20], w[21], offset); + w[37] = amd_bytealign_S (w[19], w[20], offset); + w[36] = amd_bytealign_S (w[18], w[19], offset); + w[35] = amd_bytealign_S (w[17], w[18], offset); + w[34] = amd_bytealign_S (w[16], w[17], offset); + w[33] = amd_bytealign_S (w[15], w[16], offset); + w[32] = amd_bytealign_S (w[14], w[15], offset); + w[31] = amd_bytealign_S (w[13], w[14], offset); + w[30] = amd_bytealign_S (w[12], w[13], offset); + w[29] = amd_bytealign_S (w[11], w[12], offset); + w[28] = amd_bytealign_S (w[10], w[11], offset); + w[27] = amd_bytealign_S (w[ 9], w[10], offset); + w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[18] = amd_bytealign_S (w[ 0], w[ 1], 
offset); + w[17] = amd_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; @@ -27837,106 +45339,55 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[17] = w[18]; - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 18: - w[63] = amd_bytealign_S (w[45], w[44], offset_minus_4); - w[62] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[61] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[60] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[59] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[58] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[57] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[56] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[55] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[54] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[53] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[52] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[51] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[50] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[49] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[48] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[47] = 
amd_bytealign_S (w[29], w[28], offset_minus_4); - w[46] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[45] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[44] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[43] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[42] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[41] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[40] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[39] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[38] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[37] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[36] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[35] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[34] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[33] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[32] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[31] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[30] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[29] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[28] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[27] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[26] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[25] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[24] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[23] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[22] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[21] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[20] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[19] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[18] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[44], w[45], offset); + w[62] = amd_bytealign_S (w[43], w[44], offset); + w[61] = amd_bytealign_S (w[42], w[43], offset); + w[60] = amd_bytealign_S (w[41], w[42], offset); + w[59] = amd_bytealign_S (w[40], w[41], offset); + w[58] = 
amd_bytealign_S (w[39], w[40], offset); + w[57] = amd_bytealign_S (w[38], w[39], offset); + w[56] = amd_bytealign_S (w[37], w[38], offset); + w[55] = amd_bytealign_S (w[36], w[37], offset); + w[54] = amd_bytealign_S (w[35], w[36], offset); + w[53] = amd_bytealign_S (w[34], w[35], offset); + w[52] = amd_bytealign_S (w[33], w[34], offset); + w[51] = amd_bytealign_S (w[32], w[33], offset); + w[50] = amd_bytealign_S (w[31], w[32], offset); + w[49] = amd_bytealign_S (w[30], w[31], offset); + w[48] = amd_bytealign_S (w[29], w[30], offset); + w[47] = amd_bytealign_S (w[28], w[29], offset); + w[46] = amd_bytealign_S (w[27], w[28], offset); + w[45] = amd_bytealign_S (w[26], w[27], offset); + w[44] = amd_bytealign_S (w[25], w[26], offset); + w[43] = amd_bytealign_S (w[24], w[25], offset); + w[42] = amd_bytealign_S (w[23], w[24], offset); + w[41] = amd_bytealign_S (w[22], w[23], offset); + w[40] = amd_bytealign_S (w[21], w[22], offset); + w[39] = amd_bytealign_S (w[20], w[21], offset); + w[38] = amd_bytealign_S (w[19], w[20], offset); + w[37] = amd_bytealign_S (w[18], w[19], offset); + w[36] = amd_bytealign_S (w[17], w[18], offset); + w[35] = amd_bytealign_S (w[16], w[17], offset); + w[34] = amd_bytealign_S (w[15], w[16], offset); + w[33] = amd_bytealign_S (w[14], w[15], offset); + w[32] = amd_bytealign_S (w[13], w[14], offset); + w[31] = amd_bytealign_S (w[12], w[13], offset); + w[30] = amd_bytealign_S (w[11], w[12], offset); + w[29] = amd_bytealign_S (w[10], w[11], offset); + w[28] = amd_bytealign_S (w[ 9], w[10], offset); + w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[18] = 
amd_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; @@ -27956,104 +45407,54 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[18] = w[19]; - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 19: - w[63] = amd_bytealign_S (w[44], w[43], offset_minus_4); - w[62] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[61] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[60] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[59] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[58] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[57] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[56] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[55] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[54] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[53] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[52] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[51] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[50] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[49] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[48] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[47] = amd_bytealign_S (w[28], w[27], 
offset_minus_4); - w[46] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[45] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[44] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[43] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[42] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[41] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[40] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[39] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[38] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[37] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[36] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[35] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[34] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[33] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[32] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[31] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[30] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[29] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[28] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[27] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[26] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[25] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[24] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[23] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[22] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[21] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[20] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[19] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[43], w[44], offset); + w[62] = amd_bytealign_S (w[42], w[43], offset); + w[61] = amd_bytealign_S (w[41], w[42], offset); + w[60] = amd_bytealign_S (w[40], w[41], offset); + w[59] = amd_bytealign_S (w[39], w[40], offset); + w[58] = amd_bytealign_S (w[38], w[39], offset); + w[57] = amd_bytealign_S (w[37], w[38], offset); + w[56] = 
amd_bytealign_S (w[36], w[37], offset); + w[55] = amd_bytealign_S (w[35], w[36], offset); + w[54] = amd_bytealign_S (w[34], w[35], offset); + w[53] = amd_bytealign_S (w[33], w[34], offset); + w[52] = amd_bytealign_S (w[32], w[33], offset); + w[51] = amd_bytealign_S (w[31], w[32], offset); + w[50] = amd_bytealign_S (w[30], w[31], offset); + w[49] = amd_bytealign_S (w[29], w[30], offset); + w[48] = amd_bytealign_S (w[28], w[29], offset); + w[47] = amd_bytealign_S (w[27], w[28], offset); + w[46] = amd_bytealign_S (w[26], w[27], offset); + w[45] = amd_bytealign_S (w[25], w[26], offset); + w[44] = amd_bytealign_S (w[24], w[25], offset); + w[43] = amd_bytealign_S (w[23], w[24], offset); + w[42] = amd_bytealign_S (w[22], w[23], offset); + w[41] = amd_bytealign_S (w[21], w[22], offset); + w[40] = amd_bytealign_S (w[20], w[21], offset); + w[39] = amd_bytealign_S (w[19], w[20], offset); + w[38] = amd_bytealign_S (w[18], w[19], offset); + w[37] = amd_bytealign_S (w[17], w[18], offset); + w[36] = amd_bytealign_S (w[16], w[17], offset); + w[35] = amd_bytealign_S (w[15], w[16], offset); + w[34] = amd_bytealign_S (w[14], w[15], offset); + w[33] = amd_bytealign_S (w[13], w[14], offset); + w[32] = amd_bytealign_S (w[12], w[13], offset); + w[31] = amd_bytealign_S (w[11], w[12], offset); + w[30] = amd_bytealign_S (w[10], w[11], offset); + w[29] = amd_bytealign_S (w[ 9], w[10], offset); + w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[19] = amd_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; @@ -28074,102 +45475,53 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], 
const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[19] = w[20]; - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 20: - w[63] = amd_bytealign_S (w[43], w[42], offset_minus_4); - w[62] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[61] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[60] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[59] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[58] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[57] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[56] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[55] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[54] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[53] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[52] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[51] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[50] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[49] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[48] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[47] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[46] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[45] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[44] = amd_bytealign_S (w[24], w[23], 
offset_minus_4); - w[43] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[42] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[41] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[40] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[39] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[38] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[37] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[36] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[35] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[34] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[33] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[32] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[31] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[30] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[29] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[28] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[27] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[26] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[25] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[24] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[23] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[22] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[21] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[20] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[42], w[43], offset); + w[62] = amd_bytealign_S (w[41], w[42], offset); + w[61] = amd_bytealign_S (w[40], w[41], offset); + w[60] = amd_bytealign_S (w[39], w[40], offset); + w[59] = amd_bytealign_S (w[38], w[39], offset); + w[58] = amd_bytealign_S (w[37], w[38], offset); + w[57] = amd_bytealign_S (w[36], w[37], offset); + w[56] = amd_bytealign_S (w[35], w[36], offset); + w[55] = amd_bytealign_S (w[34], w[35], offset); + w[54] = amd_bytealign_S (w[33], w[34], offset); + w[53] = amd_bytealign_S (w[32], w[33], offset); + w[52] = amd_bytealign_S (w[31], w[32], 
offset); + w[51] = amd_bytealign_S (w[30], w[31], offset); + w[50] = amd_bytealign_S (w[29], w[30], offset); + w[49] = amd_bytealign_S (w[28], w[29], offset); + w[48] = amd_bytealign_S (w[27], w[28], offset); + w[47] = amd_bytealign_S (w[26], w[27], offset); + w[46] = amd_bytealign_S (w[25], w[26], offset); + w[45] = amd_bytealign_S (w[24], w[25], offset); + w[44] = amd_bytealign_S (w[23], w[24], offset); + w[43] = amd_bytealign_S (w[22], w[23], offset); + w[42] = amd_bytealign_S (w[21], w[22], offset); + w[41] = amd_bytealign_S (w[20], w[21], offset); + w[40] = amd_bytealign_S (w[19], w[20], offset); + w[39] = amd_bytealign_S (w[18], w[19], offset); + w[38] = amd_bytealign_S (w[17], w[18], offset); + w[37] = amd_bytealign_S (w[16], w[17], offset); + w[36] = amd_bytealign_S (w[15], w[16], offset); + w[35] = amd_bytealign_S (w[14], w[15], offset); + w[34] = amd_bytealign_S (w[13], w[14], offset); + w[33] = amd_bytealign_S (w[12], w[13], offset); + w[32] = amd_bytealign_S (w[11], w[12], offset); + w[31] = amd_bytealign_S (w[10], w[11], offset); + w[30] = amd_bytealign_S (w[ 9], w[10], offset); + w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[20] = amd_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; @@ -28191,100 +45543,52 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[20] = w[21]; - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = 
w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 21: - w[63] = amd_bytealign_S (w[42], w[41], offset_minus_4); - w[62] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[61] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[60] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[59] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[58] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[57] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[56] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[55] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[54] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[53] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[52] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[51] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[50] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[49] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[48] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[47] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[46] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[45] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[44] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[43] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[42] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[41] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[40] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[39] = amd_bytealign_S (w[18], w[17], 
offset_minus_4); - w[38] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[37] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[36] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[35] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[34] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[33] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[32] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[31] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[30] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[29] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[28] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[27] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[26] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[25] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[24] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[23] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[22] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[21] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[41], w[42], offset); + w[62] = amd_bytealign_S (w[40], w[41], offset); + w[61] = amd_bytealign_S (w[39], w[40], offset); + w[60] = amd_bytealign_S (w[38], w[39], offset); + w[59] = amd_bytealign_S (w[37], w[38], offset); + w[58] = amd_bytealign_S (w[36], w[37], offset); + w[57] = amd_bytealign_S (w[35], w[36], offset); + w[56] = amd_bytealign_S (w[34], w[35], offset); + w[55] = amd_bytealign_S (w[33], w[34], offset); + w[54] = amd_bytealign_S (w[32], w[33], offset); + w[53] = amd_bytealign_S (w[31], w[32], offset); + w[52] = amd_bytealign_S (w[30], w[31], offset); + w[51] = amd_bytealign_S (w[29], w[30], offset); + w[50] = amd_bytealign_S (w[28], w[29], offset); + w[49] = amd_bytealign_S (w[27], w[28], offset); + w[48] = amd_bytealign_S (w[26], w[27], offset); + w[47] = amd_bytealign_S (w[25], w[26], offset); + w[46] = amd_bytealign_S (w[24], w[25], offset); + w[45] = amd_bytealign_S (w[23], w[24], 
offset); + w[44] = amd_bytealign_S (w[22], w[23], offset); + w[43] = amd_bytealign_S (w[21], w[22], offset); + w[42] = amd_bytealign_S (w[20], w[21], offset); + w[41] = amd_bytealign_S (w[19], w[20], offset); + w[40] = amd_bytealign_S (w[18], w[19], offset); + w[39] = amd_bytealign_S (w[17], w[18], offset); + w[38] = amd_bytealign_S (w[16], w[17], offset); + w[37] = amd_bytealign_S (w[15], w[16], offset); + w[36] = amd_bytealign_S (w[14], w[15], offset); + w[35] = amd_bytealign_S (w[13], w[14], offset); + w[34] = amd_bytealign_S (w[12], w[13], offset); + w[33] = amd_bytealign_S (w[11], w[12], offset); + w[32] = amd_bytealign_S (w[10], w[11], offset); + w[31] = amd_bytealign_S (w[ 9], w[10], offset); + w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[21] = amd_bytealign_S ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; @@ -28307,98 +45611,51 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[21] = w[22]; - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - 
w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 22: - w[63] = amd_bytealign_S (w[41], w[40], offset_minus_4); - w[62] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[61] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[60] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[59] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[58] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[57] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[56] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[55] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[54] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[53] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[52] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[51] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[50] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[49] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[48] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[47] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[46] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[45] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[44] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[43] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[42] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[41] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[40] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[39] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[38] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[37] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[36] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[35] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[34] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[33] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[32] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[31] 
= amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[30] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[29] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[28] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[27] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[26] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[25] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[24] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[23] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[22] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[40], w[41], offset); + w[62] = amd_bytealign_S (w[39], w[40], offset); + w[61] = amd_bytealign_S (w[38], w[39], offset); + w[60] = amd_bytealign_S (w[37], w[38], offset); + w[59] = amd_bytealign_S (w[36], w[37], offset); + w[58] = amd_bytealign_S (w[35], w[36], offset); + w[57] = amd_bytealign_S (w[34], w[35], offset); + w[56] = amd_bytealign_S (w[33], w[34], offset); + w[55] = amd_bytealign_S (w[32], w[33], offset); + w[54] = amd_bytealign_S (w[31], w[32], offset); + w[53] = amd_bytealign_S (w[30], w[31], offset); + w[52] = amd_bytealign_S (w[29], w[30], offset); + w[51] = amd_bytealign_S (w[28], w[29], offset); + w[50] = amd_bytealign_S (w[27], w[28], offset); + w[49] = amd_bytealign_S (w[26], w[27], offset); + w[48] = amd_bytealign_S (w[25], w[26], offset); + w[47] = amd_bytealign_S (w[24], w[25], offset); + w[46] = amd_bytealign_S (w[23], w[24], offset); + w[45] = amd_bytealign_S (w[22], w[23], offset); + w[44] = amd_bytealign_S (w[21], w[22], offset); + w[43] = amd_bytealign_S (w[20], w[21], offset); + w[42] = amd_bytealign_S (w[19], w[20], offset); + w[41] = amd_bytealign_S (w[18], w[19], offset); + w[40] = amd_bytealign_S (w[17], w[18], offset); + w[39] = amd_bytealign_S (w[16], w[17], offset); + w[38] = amd_bytealign_S (w[15], w[16], offset); + w[37] = amd_bytealign_S (w[14], w[15], offset); + w[36] = amd_bytealign_S (w[13], w[14], offset); + w[35] = amd_bytealign_S 
(w[12], w[13], offset); + w[34] = amd_bytealign_S (w[11], w[12], offset); + w[33] = amd_bytealign_S (w[10], w[11], offset); + w[32] = amd_bytealign_S (w[ 9], w[10], offset); + w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[22] = amd_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; @@ -28422,96 +45679,50 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[22] = w[23]; - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 23: - w[63] = amd_bytealign_S (w[40], w[39], offset_minus_4); - w[62] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[61] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[60] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[59] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[58] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[57] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[56] = 
amd_bytealign_S (w[33], w[32], offset_minus_4); - w[55] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[54] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[53] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[52] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[51] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[50] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[49] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[48] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[47] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[46] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[45] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[44] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[43] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[42] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[41] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[40] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[39] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[38] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[37] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[36] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[35] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[34] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[33] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[32] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[31] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[30] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[29] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[28] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[27] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[26] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[25] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[24] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[23] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[39], w[40], 
offset); + w[62] = amd_bytealign_S (w[38], w[39], offset); + w[61] = amd_bytealign_S (w[37], w[38], offset); + w[60] = amd_bytealign_S (w[36], w[37], offset); + w[59] = amd_bytealign_S (w[35], w[36], offset); + w[58] = amd_bytealign_S (w[34], w[35], offset); + w[57] = amd_bytealign_S (w[33], w[34], offset); + w[56] = amd_bytealign_S (w[32], w[33], offset); + w[55] = amd_bytealign_S (w[31], w[32], offset); + w[54] = amd_bytealign_S (w[30], w[31], offset); + w[53] = amd_bytealign_S (w[29], w[30], offset); + w[52] = amd_bytealign_S (w[28], w[29], offset); + w[51] = amd_bytealign_S (w[27], w[28], offset); + w[50] = amd_bytealign_S (w[26], w[27], offset); + w[49] = amd_bytealign_S (w[25], w[26], offset); + w[48] = amd_bytealign_S (w[24], w[25], offset); + w[47] = amd_bytealign_S (w[23], w[24], offset); + w[46] = amd_bytealign_S (w[22], w[23], offset); + w[45] = amd_bytealign_S (w[21], w[22], offset); + w[44] = amd_bytealign_S (w[20], w[21], offset); + w[43] = amd_bytealign_S (w[19], w[20], offset); + w[42] = amd_bytealign_S (w[18], w[19], offset); + w[41] = amd_bytealign_S (w[17], w[18], offset); + w[40] = amd_bytealign_S (w[16], w[17], offset); + w[39] = amd_bytealign_S (w[15], w[16], offset); + w[38] = amd_bytealign_S (w[14], w[15], offset); + w[37] = amd_bytealign_S (w[13], w[14], offset); + w[36] = amd_bytealign_S (w[12], w[13], offset); + w[35] = amd_bytealign_S (w[11], w[12], offset); + w[34] = amd_bytealign_S (w[10], w[11], offset); + w[33] = amd_bytealign_S (w[ 9], w[10], offset); + w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[23] = amd_bytealign_S ( 0, w[ 0], 
offset); w[22] = 0; w[21] = 0; w[20] = 0; @@ -28536,94 +45747,49 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[23] = w[24]; - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 24: - w[63] = amd_bytealign_S (w[39], w[38], offset_minus_4); - w[62] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[61] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[60] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[59] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[58] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[57] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[56] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[55] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[54] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[53] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[52] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[51] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[50] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[49] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[48] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[47] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[46] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[45] = amd_bytealign_S (w[21], w[20], 
offset_minus_4); - w[44] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[43] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[42] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[41] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[40] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[39] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[38] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[37] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[36] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[35] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[34] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[33] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[32] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[31] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[30] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[29] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[28] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[27] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[26] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[25] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[24] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[38], w[39], offset); + w[62] = amd_bytealign_S (w[37], w[38], offset); + w[61] = amd_bytealign_S (w[36], w[37], offset); + w[60] = amd_bytealign_S (w[35], w[36], offset); + w[59] = amd_bytealign_S (w[34], w[35], offset); + w[58] = amd_bytealign_S (w[33], w[34], offset); + w[57] = amd_bytealign_S (w[32], w[33], offset); + w[56] = amd_bytealign_S (w[31], w[32], offset); + w[55] = amd_bytealign_S (w[30], w[31], offset); + w[54] = amd_bytealign_S (w[29], w[30], offset); + w[53] = amd_bytealign_S (w[28], w[29], offset); + w[52] = amd_bytealign_S (w[27], w[28], offset); + w[51] = amd_bytealign_S (w[26], w[27], offset); + w[50] = amd_bytealign_S (w[25], w[26], offset); + w[49] = amd_bytealign_S (w[24], w[25], offset); + w[48] = 
amd_bytealign_S (w[23], w[24], offset); + w[47] = amd_bytealign_S (w[22], w[23], offset); + w[46] = amd_bytealign_S (w[21], w[22], offset); + w[45] = amd_bytealign_S (w[20], w[21], offset); + w[44] = amd_bytealign_S (w[19], w[20], offset); + w[43] = amd_bytealign_S (w[18], w[19], offset); + w[42] = amd_bytealign_S (w[17], w[18], offset); + w[41] = amd_bytealign_S (w[16], w[17], offset); + w[40] = amd_bytealign_S (w[15], w[16], offset); + w[39] = amd_bytealign_S (w[14], w[15], offset); + w[38] = amd_bytealign_S (w[13], w[14], offset); + w[37] = amd_bytealign_S (w[12], w[13], offset); + w[36] = amd_bytealign_S (w[11], w[12], offset); + w[35] = amd_bytealign_S (w[10], w[11], offset); + w[34] = amd_bytealign_S (w[ 9], w[10], offset); + w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[24] = amd_bytealign_S ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; @@ -28649,92 +45815,48 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[24] = w[25]; - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - 
w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 25: - w[63] = amd_bytealign_S (w[38], w[37], offset_minus_4); - w[62] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[61] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[60] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[59] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[58] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[57] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[56] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[55] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[54] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[53] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[52] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[51] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[50] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[49] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[48] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[47] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[46] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[45] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[44] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[43] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[42] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[41] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[40] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[39] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[38] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[37] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[36] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[35] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[34] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[33] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[32] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[31] = amd_bytealign_S 
(w[ 6], w[ 5], offset_minus_4); - w[30] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[29] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[28] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[27] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[26] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[25] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[37], w[38], offset); + w[62] = amd_bytealign_S (w[36], w[37], offset); + w[61] = amd_bytealign_S (w[35], w[36], offset); + w[60] = amd_bytealign_S (w[34], w[35], offset); + w[59] = amd_bytealign_S (w[33], w[34], offset); + w[58] = amd_bytealign_S (w[32], w[33], offset); + w[57] = amd_bytealign_S (w[31], w[32], offset); + w[56] = amd_bytealign_S (w[30], w[31], offset); + w[55] = amd_bytealign_S (w[29], w[30], offset); + w[54] = amd_bytealign_S (w[28], w[29], offset); + w[53] = amd_bytealign_S (w[27], w[28], offset); + w[52] = amd_bytealign_S (w[26], w[27], offset); + w[51] = amd_bytealign_S (w[25], w[26], offset); + w[50] = amd_bytealign_S (w[24], w[25], offset); + w[49] = amd_bytealign_S (w[23], w[24], offset); + w[48] = amd_bytealign_S (w[22], w[23], offset); + w[47] = amd_bytealign_S (w[21], w[22], offset); + w[46] = amd_bytealign_S (w[20], w[21], offset); + w[45] = amd_bytealign_S (w[19], w[20], offset); + w[44] = amd_bytealign_S (w[18], w[19], offset); + w[43] = amd_bytealign_S (w[17], w[18], offset); + w[42] = amd_bytealign_S (w[16], w[17], offset); + w[41] = amd_bytealign_S (w[15], w[16], offset); + w[40] = amd_bytealign_S (w[14], w[15], offset); + w[39] = amd_bytealign_S (w[13], w[14], offset); + w[38] = amd_bytealign_S (w[12], w[13], offset); + w[37] = amd_bytealign_S (w[11], w[12], offset); + w[36] = amd_bytealign_S (w[10], w[11], offset); + w[35] = amd_bytealign_S (w[ 9], w[10], offset); + w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[31] = 
amd_bytealign_S (w[ 5], w[ 6], offset); + w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[25] = amd_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; @@ -28761,90 +45883,47 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[25] = w[26]; - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 26: - w[63] = amd_bytealign_S (w[37], w[36], offset_minus_4); - w[62] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[61] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[60] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[59] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[58] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[57] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[56] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[55] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[54] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[53] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[52] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[51] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[50] = amd_bytealign_S (w[24], w[23], 
offset_minus_4); - w[49] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[48] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[47] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[46] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[45] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[44] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[43] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[42] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[41] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[40] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[39] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[38] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[37] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[36] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[35] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[34] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[33] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[32] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[31] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[30] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[29] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[28] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[27] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[26] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[36], w[37], offset); + w[62] = amd_bytealign_S (w[35], w[36], offset); + w[61] = amd_bytealign_S (w[34], w[35], offset); + w[60] = amd_bytealign_S (w[33], w[34], offset); + w[59] = amd_bytealign_S (w[32], w[33], offset); + w[58] = amd_bytealign_S (w[31], w[32], offset); + w[57] = amd_bytealign_S (w[30], w[31], offset); + w[56] = amd_bytealign_S (w[29], w[30], offset); + w[55] = amd_bytealign_S (w[28], w[29], offset); + w[54] = amd_bytealign_S (w[27], w[28], offset); + w[53] = amd_bytealign_S (w[26], w[27], offset); + w[52] = amd_bytealign_S (w[25], w[26], 
offset); + w[51] = amd_bytealign_S (w[24], w[25], offset); + w[50] = amd_bytealign_S (w[23], w[24], offset); + w[49] = amd_bytealign_S (w[22], w[23], offset); + w[48] = amd_bytealign_S (w[21], w[22], offset); + w[47] = amd_bytealign_S (w[20], w[21], offset); + w[46] = amd_bytealign_S (w[19], w[20], offset); + w[45] = amd_bytealign_S (w[18], w[19], offset); + w[44] = amd_bytealign_S (w[17], w[18], offset); + w[43] = amd_bytealign_S (w[16], w[17], offset); + w[42] = amd_bytealign_S (w[15], w[16], offset); + w[41] = amd_bytealign_S (w[14], w[15], offset); + w[40] = amd_bytealign_S (w[13], w[14], offset); + w[39] = amd_bytealign_S (w[12], w[13], offset); + w[38] = amd_bytealign_S (w[11], w[12], offset); + w[37] = amd_bytealign_S (w[10], w[11], offset); + w[36] = amd_bytealign_S (w[ 9], w[10], offset); + w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[26] = amd_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; @@ -28872,88 +45951,46 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[26] = w[27]; - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - 
w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 27: - w[63] = amd_bytealign_S (w[36], w[35], offset_minus_4); - w[62] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[61] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[60] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[59] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[58] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[57] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[56] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[55] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[54] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[53] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[52] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[51] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[50] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[49] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[48] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[47] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[46] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[45] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[44] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[43] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[42] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[41] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[40] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[39] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[38] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[37] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[36] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[35] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[34] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[33] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[32] = amd_bytealign_S (w[ 5], w[ 4], 
offset_minus_4); - w[31] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[30] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[29] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[28] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[27] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[35], w[36], offset); + w[62] = amd_bytealign_S (w[34], w[35], offset); + w[61] = amd_bytealign_S (w[33], w[34], offset); + w[60] = amd_bytealign_S (w[32], w[33], offset); + w[59] = amd_bytealign_S (w[31], w[32], offset); + w[58] = amd_bytealign_S (w[30], w[31], offset); + w[57] = amd_bytealign_S (w[29], w[30], offset); + w[56] = amd_bytealign_S (w[28], w[29], offset); + w[55] = amd_bytealign_S (w[27], w[28], offset); + w[54] = amd_bytealign_S (w[26], w[27], offset); + w[53] = amd_bytealign_S (w[25], w[26], offset); + w[52] = amd_bytealign_S (w[24], w[25], offset); + w[51] = amd_bytealign_S (w[23], w[24], offset); + w[50] = amd_bytealign_S (w[22], w[23], offset); + w[49] = amd_bytealign_S (w[21], w[22], offset); + w[48] = amd_bytealign_S (w[20], w[21], offset); + w[47] = amd_bytealign_S (w[19], w[20], offset); + w[46] = amd_bytealign_S (w[18], w[19], offset); + w[45] = amd_bytealign_S (w[17], w[18], offset); + w[44] = amd_bytealign_S (w[16], w[17], offset); + w[43] = amd_bytealign_S (w[15], w[16], offset); + w[42] = amd_bytealign_S (w[14], w[15], offset); + w[41] = amd_bytealign_S (w[13], w[14], offset); + w[40] = amd_bytealign_S (w[12], w[13], offset); + w[39] = amd_bytealign_S (w[11], w[12], offset); + w[38] = amd_bytealign_S (w[10], w[11], offset); + w[37] = amd_bytealign_S (w[ 9], w[10], offset); + w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[30] = amd_bytealign_S (w[ 2], w[ 3], 
offset); + w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[27] = amd_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; @@ -28982,86 +46019,45 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[27] = w[28]; - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 28: - w[63] = amd_bytealign_S (w[35], w[34], offset_minus_4); - w[62] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[61] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[60] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[59] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[58] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[57] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[56] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[55] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[54] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[53] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[52] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[51] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[50] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[49] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[48] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[47] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[46] = amd_bytealign_S 
(w[18], w[17], offset_minus_4); - w[45] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[44] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[43] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[42] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[41] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[40] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[39] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[38] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[37] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[36] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[35] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[34] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[33] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[32] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[31] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[30] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[29] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[28] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[34], w[35], offset); + w[62] = amd_bytealign_S (w[33], w[34], offset); + w[61] = amd_bytealign_S (w[32], w[33], offset); + w[60] = amd_bytealign_S (w[31], w[32], offset); + w[59] = amd_bytealign_S (w[30], w[31], offset); + w[58] = amd_bytealign_S (w[29], w[30], offset); + w[57] = amd_bytealign_S (w[28], w[29], offset); + w[56] = amd_bytealign_S (w[27], w[28], offset); + w[55] = amd_bytealign_S (w[26], w[27], offset); + w[54] = amd_bytealign_S (w[25], w[26], offset); + w[53] = amd_bytealign_S (w[24], w[25], offset); + w[52] = amd_bytealign_S (w[23], w[24], offset); + w[51] = amd_bytealign_S (w[22], w[23], offset); + w[50] = amd_bytealign_S (w[21], w[22], offset); + w[49] = amd_bytealign_S (w[20], w[21], offset); + w[48] = amd_bytealign_S (w[19], w[20], offset); + w[47] = amd_bytealign_S (w[18], w[19], offset); + w[46] = amd_bytealign_S (w[17], w[18], offset); + w[45] = amd_bytealign_S 
(w[16], w[17], offset); + w[44] = amd_bytealign_S (w[15], w[16], offset); + w[43] = amd_bytealign_S (w[14], w[15], offset); + w[42] = amd_bytealign_S (w[13], w[14], offset); + w[41] = amd_bytealign_S (w[12], w[13], offset); + w[40] = amd_bytealign_S (w[11], w[12], offset); + w[39] = amd_bytealign_S (w[10], w[11], offset); + w[38] = amd_bytealign_S (w[ 9], w[10], offset); + w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[28] = amd_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; @@ -29091,84 +46087,44 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[28] = w[29]; - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 29: - w[63] = amd_bytealign_S (w[34], w[33], offset_minus_4); - w[62] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[61] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[60] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[59] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[58] = amd_bytealign_S 
(w[29], w[28], offset_minus_4); - w[57] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[56] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[55] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[54] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[53] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[52] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[51] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[50] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[49] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[48] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[47] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[46] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[45] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[44] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[43] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[42] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[41] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[40] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[39] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[38] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[37] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[36] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[35] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[34] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[33] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[32] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[31] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[30] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[29] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[33], w[34], offset); + w[62] = amd_bytealign_S (w[32], w[33], offset); + w[61] = amd_bytealign_S (w[31], w[32], offset); + w[60] = amd_bytealign_S (w[30], w[31], offset); + w[59] = amd_bytealign_S (w[29], w[30], offset); + w[58] = amd_bytealign_S (w[28], 
w[29], offset); + w[57] = amd_bytealign_S (w[27], w[28], offset); + w[56] = amd_bytealign_S (w[26], w[27], offset); + w[55] = amd_bytealign_S (w[25], w[26], offset); + w[54] = amd_bytealign_S (w[24], w[25], offset); + w[53] = amd_bytealign_S (w[23], w[24], offset); + w[52] = amd_bytealign_S (w[22], w[23], offset); + w[51] = amd_bytealign_S (w[21], w[22], offset); + w[50] = amd_bytealign_S (w[20], w[21], offset); + w[49] = amd_bytealign_S (w[19], w[20], offset); + w[48] = amd_bytealign_S (w[18], w[19], offset); + w[47] = amd_bytealign_S (w[17], w[18], offset); + w[46] = amd_bytealign_S (w[16], w[17], offset); + w[45] = amd_bytealign_S (w[15], w[16], offset); + w[44] = amd_bytealign_S (w[14], w[15], offset); + w[43] = amd_bytealign_S (w[13], w[14], offset); + w[42] = amd_bytealign_S (w[12], w[13], offset); + w[41] = amd_bytealign_S (w[11], w[12], offset); + w[40] = amd_bytealign_S (w[10], w[11], offset); + w[39] = amd_bytealign_S (w[ 9], w[10], offset); + w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[29] = amd_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; @@ -29199,82 +46155,43 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[29] = w[30]; - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; 
- w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 30: - w[63] = amd_bytealign_S (w[33], w[32], offset_minus_4); - w[62] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[61] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[60] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[59] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[58] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[57] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[56] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[55] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[54] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[53] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[52] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[51] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[50] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[49] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[48] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[47] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[46] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[45] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[44] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[43] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[42] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[41] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[40] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[39] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[38] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[37] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[36] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[35] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[34] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - 
w[33] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[32] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[31] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[30] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[32], w[33], offset); + w[62] = amd_bytealign_S (w[31], w[32], offset); + w[61] = amd_bytealign_S (w[30], w[31], offset); + w[60] = amd_bytealign_S (w[29], w[30], offset); + w[59] = amd_bytealign_S (w[28], w[29], offset); + w[58] = amd_bytealign_S (w[27], w[28], offset); + w[57] = amd_bytealign_S (w[26], w[27], offset); + w[56] = amd_bytealign_S (w[25], w[26], offset); + w[55] = amd_bytealign_S (w[24], w[25], offset); + w[54] = amd_bytealign_S (w[23], w[24], offset); + w[53] = amd_bytealign_S (w[22], w[23], offset); + w[52] = amd_bytealign_S (w[21], w[22], offset); + w[51] = amd_bytealign_S (w[20], w[21], offset); + w[50] = amd_bytealign_S (w[19], w[20], offset); + w[49] = amd_bytealign_S (w[18], w[19], offset); + w[48] = amd_bytealign_S (w[17], w[18], offset); + w[47] = amd_bytealign_S (w[16], w[17], offset); + w[46] = amd_bytealign_S (w[15], w[16], offset); + w[45] = amd_bytealign_S (w[14], w[15], offset); + w[44] = amd_bytealign_S (w[13], w[14], offset); + w[43] = amd_bytealign_S (w[12], w[13], offset); + w[42] = amd_bytealign_S (w[11], w[12], offset); + w[41] = amd_bytealign_S (w[10], w[11], offset); + w[40] = amd_bytealign_S (w[ 9], w[10], offset); + w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[30] = amd_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; @@ -29306,80 +46223,42 @@ void 
switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[30] = w[31]; - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 31: - w[63] = amd_bytealign_S (w[32], w[31], offset_minus_4); - w[62] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[61] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[60] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[59] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[58] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[57] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[56] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[55] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[54] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[53] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[52] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[51] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[50] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[49] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[48] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[47] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[46] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[45] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[44] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[43] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[42] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[41] = 
amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[40] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[39] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[38] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[37] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[36] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[35] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[34] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[33] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[32] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[31] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[31], w[32], offset); + w[62] = amd_bytealign_S (w[30], w[31], offset); + w[61] = amd_bytealign_S (w[29], w[30], offset); + w[60] = amd_bytealign_S (w[28], w[29], offset); + w[59] = amd_bytealign_S (w[27], w[28], offset); + w[58] = amd_bytealign_S (w[26], w[27], offset); + w[57] = amd_bytealign_S (w[25], w[26], offset); + w[56] = amd_bytealign_S (w[24], w[25], offset); + w[55] = amd_bytealign_S (w[23], w[24], offset); + w[54] = amd_bytealign_S (w[22], w[23], offset); + w[53] = amd_bytealign_S (w[21], w[22], offset); + w[52] = amd_bytealign_S (w[20], w[21], offset); + w[51] = amd_bytealign_S (w[19], w[20], offset); + w[50] = amd_bytealign_S (w[18], w[19], offset); + w[49] = amd_bytealign_S (w[17], w[18], offset); + w[48] = amd_bytealign_S (w[16], w[17], offset); + w[47] = amd_bytealign_S (w[15], w[16], offset); + w[46] = amd_bytealign_S (w[14], w[15], offset); + w[45] = amd_bytealign_S (w[13], w[14], offset); + w[44] = amd_bytealign_S (w[12], w[13], offset); + w[43] = amd_bytealign_S (w[11], w[12], offset); + w[42] = amd_bytealign_S (w[10], w[11], offset); + w[41] = amd_bytealign_S (w[ 9], w[10], offset); + w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[36] = amd_bytealign_S 
(w[ 4], w[ 5], offset); + w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[31] = amd_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; @@ -29412,78 +46291,41 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[31] = w[32]; - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 32: - w[63] = amd_bytealign_S (w[31], w[30], offset_minus_4); - w[62] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[61] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[60] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[59] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[58] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[57] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[56] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[55] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[54] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[53] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[52] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[51] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[50] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[49] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[48] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[47] = amd_bytealign_S (w[15], w[14], 
offset_minus_4); - w[46] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[45] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[44] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[43] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[42] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[41] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[40] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[39] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[38] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[37] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[36] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[35] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[34] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[33] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[32] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[30], w[31], offset); + w[62] = amd_bytealign_S (w[29], w[30], offset); + w[61] = amd_bytealign_S (w[28], w[29], offset); + w[60] = amd_bytealign_S (w[27], w[28], offset); + w[59] = amd_bytealign_S (w[26], w[27], offset); + w[58] = amd_bytealign_S (w[25], w[26], offset); + w[57] = amd_bytealign_S (w[24], w[25], offset); + w[56] = amd_bytealign_S (w[23], w[24], offset); + w[55] = amd_bytealign_S (w[22], w[23], offset); + w[54] = amd_bytealign_S (w[21], w[22], offset); + w[53] = amd_bytealign_S (w[20], w[21], offset); + w[52] = amd_bytealign_S (w[19], w[20], offset); + w[51] = amd_bytealign_S (w[18], w[19], offset); + w[50] = amd_bytealign_S (w[17], w[18], offset); + w[49] = amd_bytealign_S (w[16], w[17], offset); + w[48] = amd_bytealign_S (w[15], w[16], offset); + w[47] = amd_bytealign_S (w[14], w[15], offset); + w[46] = amd_bytealign_S (w[13], w[14], offset); + w[45] = amd_bytealign_S (w[12], w[13], offset); + w[44] = amd_bytealign_S (w[11], w[12], offset); + w[43] = amd_bytealign_S (w[10], w[11], offset); + w[42] = amd_bytealign_S (w[ 9], w[10], offset); + w[41] = 
amd_bytealign_S (w[ 8], w[ 9], offset); + w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[32] = amd_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; @@ -29517,76 +46359,40 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[32] = w[33]; - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 33: - w[63] = amd_bytealign_S (w[30], w[29], offset_minus_4); - w[62] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[61] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[60] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[59] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[58] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[57] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[56] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[55] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[54] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[53] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[52] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[51] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[50] = 
amd_bytealign_S (w[17], w[16], offset_minus_4); - w[49] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[48] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[47] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[46] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[45] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[44] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[43] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[42] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[41] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[40] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[39] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[38] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[37] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[36] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[35] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[34] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[33] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[29], w[30], offset); + w[62] = amd_bytealign_S (w[28], w[29], offset); + w[61] = amd_bytealign_S (w[27], w[28], offset); + w[60] = amd_bytealign_S (w[26], w[27], offset); + w[59] = amd_bytealign_S (w[25], w[26], offset); + w[58] = amd_bytealign_S (w[24], w[25], offset); + w[57] = amd_bytealign_S (w[23], w[24], offset); + w[56] = amd_bytealign_S (w[22], w[23], offset); + w[55] = amd_bytealign_S (w[21], w[22], offset); + w[54] = amd_bytealign_S (w[20], w[21], offset); + w[53] = amd_bytealign_S (w[19], w[20], offset); + w[52] = amd_bytealign_S (w[18], w[19], offset); + w[51] = amd_bytealign_S (w[17], w[18], offset); + w[50] = amd_bytealign_S (w[16], w[17], offset); + w[49] = amd_bytealign_S (w[15], w[16], offset); + w[48] = amd_bytealign_S (w[14], w[15], offset); + w[47] = amd_bytealign_S (w[13], w[14], offset); + w[46] = amd_bytealign_S (w[12], w[13], offset); + w[45] = amd_bytealign_S (w[11], w[12], offset); + w[44] = 
amd_bytealign_S (w[10], w[11], offset); + w[43] = amd_bytealign_S (w[ 9], w[10], offset); + w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[33] = amd_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; @@ -29621,74 +46427,39 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[33] = w[34]; - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 34: - w[63] = amd_bytealign_S (w[29], w[28], offset_minus_4); - w[62] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[61] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[60] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[59] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[58] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[57] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[56] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[55] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[54] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[53] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[52] = amd_bytealign_S (w[18], w[17], 
offset_minus_4); - w[51] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[50] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[49] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[48] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[47] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[46] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[45] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[44] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[43] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[42] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[41] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[40] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[39] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[38] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[37] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[36] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[35] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[34] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[28], w[29], offset); + w[62] = amd_bytealign_S (w[27], w[28], offset); + w[61] = amd_bytealign_S (w[26], w[27], offset); + w[60] = amd_bytealign_S (w[25], w[26], offset); + w[59] = amd_bytealign_S (w[24], w[25], offset); + w[58] = amd_bytealign_S (w[23], w[24], offset); + w[57] = amd_bytealign_S (w[22], w[23], offset); + w[56] = amd_bytealign_S (w[21], w[22], offset); + w[55] = amd_bytealign_S (w[20], w[21], offset); + w[54] = amd_bytealign_S (w[19], w[20], offset); + w[53] = amd_bytealign_S (w[18], w[19], offset); + w[52] = amd_bytealign_S (w[17], w[18], offset); + w[51] = amd_bytealign_S (w[16], w[17], offset); + w[50] = amd_bytealign_S (w[15], w[16], offset); + w[49] = amd_bytealign_S (w[14], w[15], offset); + w[48] = amd_bytealign_S (w[13], w[14], offset); + w[47] = amd_bytealign_S (w[12], w[13], offset); + w[46] = amd_bytealign_S (w[11], w[12], offset); + w[45] = amd_bytealign_S (w[10], w[11], 
offset); + w[44] = amd_bytealign_S (w[ 9], w[10], offset); + w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[34] = amd_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; @@ -29724,72 +46495,38 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[34] = w[35]; - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 35: - w[63] = amd_bytealign_S (w[28], w[27], offset_minus_4); - w[62] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[61] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[60] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[59] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[58] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[57] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[56] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[55] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[54] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[53] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[52] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[51] = amd_bytealign_S (w[16], w[15], 
offset_minus_4); - w[50] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[49] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[48] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[47] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[46] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[45] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[44] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[43] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[42] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[41] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[40] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[39] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[38] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[37] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[36] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[35] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[27], w[28], offset); + w[62] = amd_bytealign_S (w[26], w[27], offset); + w[61] = amd_bytealign_S (w[25], w[26], offset); + w[60] = amd_bytealign_S (w[24], w[25], offset); + w[59] = amd_bytealign_S (w[23], w[24], offset); + w[58] = amd_bytealign_S (w[22], w[23], offset); + w[57] = amd_bytealign_S (w[21], w[22], offset); + w[56] = amd_bytealign_S (w[20], w[21], offset); + w[55] = amd_bytealign_S (w[19], w[20], offset); + w[54] = amd_bytealign_S (w[18], w[19], offset); + w[53] = amd_bytealign_S (w[17], w[18], offset); + w[52] = amd_bytealign_S (w[16], w[17], offset); + w[51] = amd_bytealign_S (w[15], w[16], offset); + w[50] = amd_bytealign_S (w[14], w[15], offset); + w[49] = amd_bytealign_S (w[13], w[14], offset); + w[48] = amd_bytealign_S (w[12], w[13], offset); + w[47] = amd_bytealign_S (w[11], w[12], offset); + w[46] = amd_bytealign_S (w[10], w[11], offset); + w[45] = amd_bytealign_S (w[ 9], w[10], offset); + w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[42] 
= amd_bytealign_S (w[ 6], w[ 7], offset); + w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[35] = amd_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; @@ -29826,70 +46563,37 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[35] = w[36]; - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 36: - w[63] = amd_bytealign_S (w[27], w[26], offset_minus_4); - w[62] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[61] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[60] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[59] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[58] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[57] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[56] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[55] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[54] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[53] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[52] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[51] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[50] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[49] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[48] = amd_bytealign_S (w[12], w[11], 
offset_minus_4); - w[47] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[46] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[45] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[44] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[43] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[42] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[41] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[40] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[39] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[38] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[37] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[36] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[26], w[27], offset); + w[62] = amd_bytealign_S (w[25], w[26], offset); + w[61] = amd_bytealign_S (w[24], w[25], offset); + w[60] = amd_bytealign_S (w[23], w[24], offset); + w[59] = amd_bytealign_S (w[22], w[23], offset); + w[58] = amd_bytealign_S (w[21], w[22], offset); + w[57] = amd_bytealign_S (w[20], w[21], offset); + w[56] = amd_bytealign_S (w[19], w[20], offset); + w[55] = amd_bytealign_S (w[18], w[19], offset); + w[54] = amd_bytealign_S (w[17], w[18], offset); + w[53] = amd_bytealign_S (w[16], w[17], offset); + w[52] = amd_bytealign_S (w[15], w[16], offset); + w[51] = amd_bytealign_S (w[14], w[15], offset); + w[50] = amd_bytealign_S (w[13], w[14], offset); + w[49] = amd_bytealign_S (w[12], w[13], offset); + w[48] = amd_bytealign_S (w[11], w[12], offset); + w[47] = amd_bytealign_S (w[10], w[11], offset); + w[46] = amd_bytealign_S (w[ 9], w[10], offset); + w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[38] = amd_bytealign_S (w[ 1], w[ 2], 
offset); + w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[36] = amd_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; @@ -29927,68 +46631,36 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[36] = w[37]; - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 37: - w[63] = amd_bytealign_S (w[26], w[25], offset_minus_4); - w[62] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[61] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[60] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[59] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[58] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[57] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[56] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[55] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[54] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[53] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[52] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[51] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[50] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[49] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[48] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[47] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[46] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[45] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[44] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[43] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - 
w[42] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[41] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[40] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[39] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[38] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[37] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[25], w[26], offset); + w[62] = amd_bytealign_S (w[24], w[25], offset); + w[61] = amd_bytealign_S (w[23], w[24], offset); + w[60] = amd_bytealign_S (w[22], w[23], offset); + w[59] = amd_bytealign_S (w[21], w[22], offset); + w[58] = amd_bytealign_S (w[20], w[21], offset); + w[57] = amd_bytealign_S (w[19], w[20], offset); + w[56] = amd_bytealign_S (w[18], w[19], offset); + w[55] = amd_bytealign_S (w[17], w[18], offset); + w[54] = amd_bytealign_S (w[16], w[17], offset); + w[53] = amd_bytealign_S (w[15], w[16], offset); + w[52] = amd_bytealign_S (w[14], w[15], offset); + w[51] = amd_bytealign_S (w[13], w[14], offset); + w[50] = amd_bytealign_S (w[12], w[13], offset); + w[49] = amd_bytealign_S (w[11], w[12], offset); + w[48] = amd_bytealign_S (w[10], w[11], offset); + w[47] = amd_bytealign_S (w[ 9], w[10], offset); + w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[37] = amd_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; @@ -30027,66 +46699,35 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[37] = w[38]; - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = 
w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 38: - w[63] = amd_bytealign_S (w[25], w[24], offset_minus_4); - w[62] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[61] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[60] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[59] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[58] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[57] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[56] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[55] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[54] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[53] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[52] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[51] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[50] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[49] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[48] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[47] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[46] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[45] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[44] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[43] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[42] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[41] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[40] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[39] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[38] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[24], w[25], offset); + w[62] = amd_bytealign_S (w[23], w[24], offset); + w[61] = amd_bytealign_S (w[22], w[23], 
offset); + w[60] = amd_bytealign_S (w[21], w[22], offset); + w[59] = amd_bytealign_S (w[20], w[21], offset); + w[58] = amd_bytealign_S (w[19], w[20], offset); + w[57] = amd_bytealign_S (w[18], w[19], offset); + w[56] = amd_bytealign_S (w[17], w[18], offset); + w[55] = amd_bytealign_S (w[16], w[17], offset); + w[54] = amd_bytealign_S (w[15], w[16], offset); + w[53] = amd_bytealign_S (w[14], w[15], offset); + w[52] = amd_bytealign_S (w[13], w[14], offset); + w[51] = amd_bytealign_S (w[12], w[13], offset); + w[50] = amd_bytealign_S (w[11], w[12], offset); + w[49] = amd_bytealign_S (w[10], w[11], offset); + w[48] = amd_bytealign_S (w[ 9], w[10], offset); + w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[39] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[38] = amd_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; @@ -30126,64 +46767,34 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[38] = w[39]; - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 39: - w[63] = amd_bytealign_S (w[24], w[23], offset_minus_4); - w[62] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[61] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[60] = amd_bytealign_S 
(w[21], w[20], offset_minus_4); - w[59] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[58] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[57] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[56] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[55] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[54] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[53] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[52] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[51] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[50] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[49] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[48] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[47] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[46] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[45] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[44] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[43] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[42] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[41] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[40] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[39] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[23], w[24], offset); + w[62] = amd_bytealign_S (w[22], w[23], offset); + w[61] = amd_bytealign_S (w[21], w[22], offset); + w[60] = amd_bytealign_S (w[20], w[21], offset); + w[59] = amd_bytealign_S (w[19], w[20], offset); + w[58] = amd_bytealign_S (w[18], w[19], offset); + w[57] = amd_bytealign_S (w[17], w[18], offset); + w[56] = amd_bytealign_S (w[16], w[17], offset); + w[55] = amd_bytealign_S (w[15], w[16], offset); + w[54] = amd_bytealign_S (w[14], w[15], offset); + w[53] = amd_bytealign_S (w[13], w[14], offset); + w[52] = amd_bytealign_S (w[12], w[13], offset); + w[51] = amd_bytealign_S (w[11], w[12], offset); + w[50] = amd_bytealign_S (w[10], w[11], offset); + w[49] = amd_bytealign_S (w[ 9], w[10], offset); + 
w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[39] = amd_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; @@ -30224,62 +46835,33 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[39] = w[40]; - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 40: - w[63] = amd_bytealign_S (w[23], w[22], offset_minus_4); - w[62] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[61] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[60] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[59] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[58] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[57] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[56] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[55] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[54] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[53] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[52] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[51] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[50] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[49] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[48] = 
amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[47] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[46] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[45] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[44] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[43] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[42] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[41] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[40] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[22], w[23], offset); + w[62] = amd_bytealign_S (w[21], w[22], offset); + w[61] = amd_bytealign_S (w[20], w[21], offset); + w[60] = amd_bytealign_S (w[19], w[20], offset); + w[59] = amd_bytealign_S (w[18], w[19], offset); + w[58] = amd_bytealign_S (w[17], w[18], offset); + w[57] = amd_bytealign_S (w[16], w[17], offset); + w[56] = amd_bytealign_S (w[15], w[16], offset); + w[55] = amd_bytealign_S (w[14], w[15], offset); + w[54] = amd_bytealign_S (w[13], w[14], offset); + w[53] = amd_bytealign_S (w[12], w[13], offset); + w[52] = amd_bytealign_S (w[11], w[12], offset); + w[51] = amd_bytealign_S (w[10], w[11], offset); + w[50] = amd_bytealign_S (w[ 9], w[10], offset); + w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[40] = amd_bytealign_S ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; @@ -30321,60 +46903,32 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[40] = w[41]; - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = 
w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 41: - w[63] = amd_bytealign_S (w[22], w[21], offset_minus_4); - w[62] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[61] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[60] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[59] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[58] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[57] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[56] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[55] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[54] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[53] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[52] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[51] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[50] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[49] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[48] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[47] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[46] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[45] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[44] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[43] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[42] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[41] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[21], w[22], offset); + w[62] = amd_bytealign_S (w[20], w[21], offset); + w[61] = amd_bytealign_S (w[19], w[20], offset); + w[60] = amd_bytealign_S (w[18], w[19], offset); + w[59] = amd_bytealign_S (w[17], w[18], offset); + w[58] = amd_bytealign_S (w[16], w[17], offset); + w[57] = amd_bytealign_S (w[15], w[16], offset); + 
w[56] = amd_bytealign_S (w[14], w[15], offset); + w[55] = amd_bytealign_S (w[13], w[14], offset); + w[54] = amd_bytealign_S (w[12], w[13], offset); + w[53] = amd_bytealign_S (w[11], w[12], offset); + w[52] = amd_bytealign_S (w[10], w[11], offset); + w[51] = amd_bytealign_S (w[ 9], w[10], offset); + w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[41] = amd_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; @@ -30417,58 +46971,31 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[41] = w[42]; - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 42: - w[63] = amd_bytealign_S (w[21], w[20], offset_minus_4); - w[62] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[61] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[60] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[59] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[58] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[57] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[56] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[55] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[54] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[53] = amd_bytealign_S (w[11], 
w[10], offset_minus_4); - w[52] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[51] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[50] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[49] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[48] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[47] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[46] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[45] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[44] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[43] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[42] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[20], w[21], offset); + w[62] = amd_bytealign_S (w[19], w[20], offset); + w[61] = amd_bytealign_S (w[18], w[19], offset); + w[60] = amd_bytealign_S (w[17], w[18], offset); + w[59] = amd_bytealign_S (w[16], w[17], offset); + w[58] = amd_bytealign_S (w[15], w[16], offset); + w[57] = amd_bytealign_S (w[14], w[15], offset); + w[56] = amd_bytealign_S (w[13], w[14], offset); + w[55] = amd_bytealign_S (w[12], w[13], offset); + w[54] = amd_bytealign_S (w[11], w[12], offset); + w[53] = amd_bytealign_S (w[10], w[11], offset); + w[52] = amd_bytealign_S (w[ 9], w[10], offset); + w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[42] = amd_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; @@ -30512,56 +47039,30 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[42] = w[43]; - w[43] = w[44]; - w[44] = w[45]; - w[45] = 
w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 43: - w[63] = amd_bytealign_S (w[20], w[19], offset_minus_4); - w[62] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[61] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[60] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[59] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[58] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[57] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[56] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[55] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[54] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[53] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[52] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[51] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[50] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[49] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[48] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[47] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[46] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[45] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[44] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[43] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[19], w[20], offset); + w[62] = amd_bytealign_S (w[18], w[19], offset); + w[61] = amd_bytealign_S (w[17], w[18], offset); + w[60] = amd_bytealign_S (w[16], w[17], offset); + w[59] = amd_bytealign_S (w[15], w[16], offset); + w[58] = amd_bytealign_S (w[14], w[15], offset); + w[57] = amd_bytealign_S (w[13], w[14], offset); + w[56] = amd_bytealign_S (w[12], w[13], offset); + w[55] = amd_bytealign_S (w[11], w[12], offset); + 
w[54] = amd_bytealign_S (w[10], w[11], offset); + w[53] = amd_bytealign_S (w[ 9], w[10], offset); + w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[43] = amd_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; @@ -30606,54 +47107,29 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[43] = w[44]; - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 44: - w[63] = amd_bytealign_S (w[19], w[18], offset_minus_4); - w[62] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[61] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[60] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[59] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[58] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[57] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[56] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[55] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[54] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[53] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[52] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[51] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[50] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[49] = amd_bytealign_S (w[ 5], w[ 
4], offset_minus_4); - w[48] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[47] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[46] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[45] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[44] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[18], w[19], offset); + w[62] = amd_bytealign_S (w[17], w[18], offset); + w[61] = amd_bytealign_S (w[16], w[17], offset); + w[60] = amd_bytealign_S (w[15], w[16], offset); + w[59] = amd_bytealign_S (w[14], w[15], offset); + w[58] = amd_bytealign_S (w[13], w[14], offset); + w[57] = amd_bytealign_S (w[12], w[13], offset); + w[56] = amd_bytealign_S (w[11], w[12], offset); + w[55] = amd_bytealign_S (w[10], w[11], offset); + w[54] = amd_bytealign_S (w[ 9], w[10], offset); + w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[44] = amd_bytealign_S ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; @@ -30699,52 +47175,28 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[44] = w[45]; - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 45: - w[63] = amd_bytealign_S (w[18], w[17], offset_minus_4); - w[62] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[61] = amd_bytealign_S (w[16], w[15], 
offset_minus_4); - w[60] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[59] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[58] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[57] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[56] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[55] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[54] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[53] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[52] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[51] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[50] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[49] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[48] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[47] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[46] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[45] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[17], w[18], offset); + w[62] = amd_bytealign_S (w[16], w[17], offset); + w[61] = amd_bytealign_S (w[15], w[16], offset); + w[60] = amd_bytealign_S (w[14], w[15], offset); + w[59] = amd_bytealign_S (w[13], w[14], offset); + w[58] = amd_bytealign_S (w[12], w[13], offset); + w[57] = amd_bytealign_S (w[11], w[12], offset); + w[56] = amd_bytealign_S (w[10], w[11], offset); + w[55] = amd_bytealign_S (w[ 9], w[10], offset); + w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[45] = amd_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; @@ -30791,50 +47243,27 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], 
const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[45] = w[46]; - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 46: - w[63] = amd_bytealign_S (w[17], w[16], offset_minus_4); - w[62] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[61] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[60] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[59] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[58] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[57] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[56] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[55] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[54] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[53] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[52] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[51] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[50] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[49] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[48] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[47] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[46] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[16], w[17], offset); + w[62] = amd_bytealign_S (w[15], w[16], offset); + w[61] = amd_bytealign_S (w[14], w[15], offset); + w[60] = amd_bytealign_S (w[13], w[14], offset); + w[59] = amd_bytealign_S (w[12], w[13], offset); + w[58] = amd_bytealign_S (w[11], w[12], offset); + w[57] = amd_bytealign_S (w[10], w[11], offset); + w[56] = amd_bytealign_S (w[ 9], w[10], offset); + w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); 
+ w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[46] = amd_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; @@ -30882,48 +47311,26 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[46] = w[47]; - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 47: - w[63] = amd_bytealign_S (w[16], w[15], offset_minus_4); - w[62] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[61] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[60] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[59] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[58] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[57] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[56] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[55] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[54] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[53] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[52] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[51] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[50] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[49] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[48] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[47] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[15], w[16], offset); + w[62] = amd_bytealign_S (w[14], w[15], offset); + w[61] = amd_bytealign_S (w[13], w[14], offset); + w[60] = 
amd_bytealign_S (w[12], w[13], offset); + w[59] = amd_bytealign_S (w[11], w[12], offset); + w[58] = amd_bytealign_S (w[10], w[11], offset); + w[57] = amd_bytealign_S (w[ 9], w[10], offset); + w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[47] = amd_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; @@ -30972,46 +47379,25 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[47] = w[48]; - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 48: - w[63] = amd_bytealign_S (w[15], w[14], offset_minus_4); - w[62] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[61] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[60] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[59] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[58] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[57] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[56] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[55] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[54] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[53] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[52] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[51] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[50] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[49] = 
amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[48] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[14], w[15], offset); + w[62] = amd_bytealign_S (w[13], w[14], offset); + w[61] = amd_bytealign_S (w[12], w[13], offset); + w[60] = amd_bytealign_S (w[11], w[12], offset); + w[59] = amd_bytealign_S (w[10], w[11], offset); + w[58] = amd_bytealign_S (w[ 9], w[10], offset); + w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[48] = amd_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; @@ -31061,44 +47447,24 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[48] = w[49]; - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 49: - w[63] = amd_bytealign_S (w[14], w[13], offset_minus_4); - w[62] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[61] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[60] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[59] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[58] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[57] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[56] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[55] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[54] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[53] = amd_bytealign_S (w[ 4], w[ 3], 
offset_minus_4); - w[52] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[51] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[50] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[49] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[13], w[14], offset); + w[62] = amd_bytealign_S (w[12], w[13], offset); + w[61] = amd_bytealign_S (w[11], w[12], offset); + w[60] = amd_bytealign_S (w[10], w[11], offset); + w[59] = amd_bytealign_S (w[ 9], w[10], offset); + w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[49] = amd_bytealign_S ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; @@ -31149,42 +47515,23 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[49] = w[50]; - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 50: - w[63] = amd_bytealign_S (w[13], w[12], offset_minus_4); - w[62] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[61] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[60] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[59] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[58] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[57] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[56] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[55] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[54] = amd_bytealign_S (w[ 4], 
w[ 3], offset_minus_4); - w[53] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[52] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[51] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[50] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[12], w[13], offset); + w[62] = amd_bytealign_S (w[11], w[12], offset); + w[61] = amd_bytealign_S (w[10], w[11], offset); + w[60] = amd_bytealign_S (w[ 9], w[10], offset); + w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[50] = amd_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; @@ -31236,40 +47583,22 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[50] = w[51]; - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 51: - w[63] = amd_bytealign_S (w[12], w[11], offset_minus_4); - w[62] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[61] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[60] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[59] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[58] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[57] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[56] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[55] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[54] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[53] = amd_bytealign_S (w[ 2], 
w[ 1], offset_minus_4); - w[52] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[51] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[11], w[12], offset); + w[62] = amd_bytealign_S (w[10], w[11], offset); + w[61] = amd_bytealign_S (w[ 9], w[10], offset); + w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[51] = amd_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; @@ -31322,38 +47651,21 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[51] = w[52]; - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 52: - w[63] = amd_bytealign_S (w[11], w[10], offset_minus_4); - w[62] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[61] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[60] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[59] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[58] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[57] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[56] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[55] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[54] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[53] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[52] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[10], w[11], offset); + w[62] = amd_bytealign_S (w[ 9], w[10], offset); + 
w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[52] = amd_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; @@ -31407,36 +47719,20 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[52] = w[53]; - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 53: - w[63] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); - w[62] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[61] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[60] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[59] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[58] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[57] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[56] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[55] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[54] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[53] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 9], w[10], offset); + w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[54] = 
amd_bytealign_S (w[ 0], w[ 1], offset); + w[53] = amd_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; @@ -31491,34 +47787,19 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[53] = w[54]; - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 54: - w[63] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); - w[62] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[61] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[60] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[59] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[58] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[57] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[56] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[55] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[54] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[54] = amd_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; @@ -31574,32 +47855,18 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[54] = w[55]; - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 55: - w[63] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); - w[62] = amd_bytealign_S (w[ 
7], w[ 6], offset_minus_4); - w[61] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[60] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[59] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[58] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[57] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[56] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[55] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[55] = amd_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; @@ -31656,30 +47923,17 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[55] = w[56]; - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 56: - w[63] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); - w[62] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[61] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[60] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[59] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[58] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[57] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[56] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[57] = 
amd_bytealign_S (w[ 0], w[ 1], offset); + w[56] = amd_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; @@ -31737,28 +47991,16 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[56] = w[57]; - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 57: - w[63] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); - w[62] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[61] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[60] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[59] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[58] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[57] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[57] = amd_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; @@ -31817,26 +48059,15 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[57] = w[58]; - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 58: - w[63] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); - w[62] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[61] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[60] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[59] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[58] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[60] = 
amd_bytealign_S (w[ 1], w[ 2], offset); + w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[58] = amd_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; @@ -31896,24 +48127,14 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[58] = w[59]; - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 59: - w[63] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); - w[62] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[61] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[60] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[59] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[59] = amd_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; @@ -31974,22 +48195,13 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[59] = w[60]; - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 60: - w[63] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); - w[62] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); - w[61] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[60] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[60] = amd_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; @@ -32051,20 +48263,12 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[60] = w[61]; - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 61: - w[63] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); 
- w[62] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[61] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[61] = amd_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; @@ -32127,18 +48331,11 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[61] = w[62]; - w[62] = w[63]; - w[63] = 0; - } - break; case 62: - w[63] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); - w[62] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[62] = amd_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; @@ -32202,16 +48399,10 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[62] = w[63]; - w[63] = 0; - } - break; case 63: - w[63] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[63] = amd_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; @@ -32276,13 +48467,12 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) w[ 1] = 0; w[ 0] = 0; - if (offset_mod_4 == 0) - { - w[63] = 0; - } - break; } + + #pragma unroll + for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); + #endif #ifdef IS_NV diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index 170ec5385..74ca84cc2 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -760,7 +760,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD || defined IS_GENERIC u32 s0 = 0; u32 s1 = 0; u32 s2 = 0; @@ -769,64 +768,69 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 u32 s5 = 0; u32 s6 = 0; u32 s7 = 0; - u32 s8 = 0; + + #if defined IS_AMD || defined IS_GENERIC + const u32 src_r00 = swap32_S (src_r0[0]); + const u32 src_r01 = swap32_S (src_r0[1]); + const 
u32 src_r02 = swap32_S (src_r0[2]); + const u32 src_r03 = swap32_S (src_r0[3]); + const u32 src_r10 = swap32_S (src_r1[0]); + const u32 src_r11 = swap32_S (src_r1[1]); + const u32 src_r12 = swap32_S (src_r1[2]); + const u32 src_r13 = swap32_S (src_r1[3]); switch (offset / 4) { case 0: - s8 = amd_bytealign_S ( 0, src_r1[3], offset_minus_4); - s7 = amd_bytealign_S (src_r1[3], src_r1[2], offset_minus_4); - s6 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4); - s5 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s4 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s3 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s2 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s1 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s0 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r12, src_r13, offset); + s6 = amd_bytealign_S (src_r11, src_r12, offset); + s5 = amd_bytealign_S (src_r10, src_r11, offset); + s4 = amd_bytealign_S (src_r03, src_r10, offset); + s3 = amd_bytealign_S (src_r02, src_r03, offset); + s2 = amd_bytealign_S (src_r01, src_r02, offset); + s1 = amd_bytealign_S (src_r00, src_r01, offset); + s0 = amd_bytealign_S ( 0, src_r00, offset); break; case 1: - s8 = amd_bytealign_S ( 0, src_r1[2], offset_minus_4); - s7 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4); - s6 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s5 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s4 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s3 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s2 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s1 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r11, src_r12, offset); + s6 = amd_bytealign_S (src_r10, src_r11, offset); + s5 = amd_bytealign_S (src_r03, src_r10, offset); + s4 = amd_bytealign_S (src_r02, src_r03, offset); + s3 = amd_bytealign_S (src_r01, src_r02, offset); 
+ s2 = amd_bytealign_S (src_r00, src_r01, offset); + s1 = amd_bytealign_S ( 0, src_r00, offset); s0 = 0; break; case 2: - s8 = amd_bytealign_S ( 0, src_r1[1], offset_minus_4); - s7 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4); - s6 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s5 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s4 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s3 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s2 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r10, src_r11, offset); + s6 = amd_bytealign_S (src_r03, src_r10, offset); + s5 = amd_bytealign_S (src_r02, src_r03, offset); + s4 = amd_bytealign_S (src_r01, src_r02, offset); + s3 = amd_bytealign_S (src_r00, src_r01, offset); + s2 = amd_bytealign_S ( 0, src_r00, offset); s1 = 0; s0 = 0; break; case 3: - s8 = amd_bytealign_S ( 0, src_r1[0], offset_minus_4); - s7 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4); - s6 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s5 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s4 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s3 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r03, src_r10, offset); + s6 = amd_bytealign_S (src_r02, src_r03, offset); + s5 = amd_bytealign_S (src_r01, src_r02, offset); + s4 = amd_bytealign_S (src_r00, src_r01, offset); + s3 = amd_bytealign_S ( 0, src_r00, offset); s2 = 0; s1 = 0; s0 = 0; + break; case 4: - s8 = amd_bytealign_S ( 0, src_r0[3], offset_minus_4); - s7 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4); - s6 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s5 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s4 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r02, src_r03, offset); + s6 = amd_bytealign_S (src_r01, src_r02, offset); + s5 = amd_bytealign_S (src_r00, src_r01, offset); + s4 = 
amd_bytealign_S ( 0, src_r00, offset); s3 = 0; s2 = 0; s1 = 0; @@ -834,10 +838,9 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 5: - s8 = amd_bytealign_S ( 0, src_r0[2], offset_minus_4); - s7 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4); - s6 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s5 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r01, src_r02, offset); + s6 = amd_bytealign_S (src_r00, src_r01, offset); + s5 = amd_bytealign_S ( 0, src_r00, offset); s4 = 0; s3 = 0; s2 = 0; @@ -846,9 +849,8 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 6: - s8 = amd_bytealign_S ( 0, src_r0[1], offset_minus_4); - s7 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4); - s6 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S (src_r00, src_r01, offset); + s6 = amd_bytealign_S ( 0, src_r00, offset); s5 = 0; s4 = 0; s3 = 0; @@ -858,8 +860,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 7: - s8 = amd_bytealign_S ( 0, src_r0[0], offset_minus_4); - s7 = amd_bytealign_S (src_r0[0], 0, offset_minus_4); + s7 = amd_bytealign_S ( 0, src_r00, offset); s6 = 0; s5 = 0; s4 = 0; @@ -870,83 +871,69 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; } - if (offset_mod_4 == 0) - { - buf0[0] = src_l0[0] | s1; - buf0[1] = src_l0[1] | s2; - buf0[2] = src_l0[2] | s3; - buf0[3] = src_l0[3] | s4; - buf1[0] = src_l1[0] | s5; - buf1[1] = src_l1[1] | s6; - buf1[2] = src_l1[2] | s7; - buf1[3] = src_l1[3] | s8; - } - else - { - buf0[0] = src_l0[0] | s0; - buf0[1] = src_l0[1] | s1; - buf0[2] = src_l0[2] | s2; - buf0[3] = src_l0[3] | s3; - buf1[0] = src_l1[0] | s4; - buf1[1] = src_l1[1] | s5; - buf1[2] = src_l1[2] | s6; - buf1[3] = src_l1[3] | s7; - } + s0 = swap32_S (s0); + s1 = swap32_S (s1); + s2 = swap32_S (s2); + s3 = swap32_S (s3); + s4 = 
swap32_S (s4); + s5 = swap32_S (s5); + s6 = swap32_S (s6); + s7 = swap32_S (s7); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - u32 s0 = 0; - u32 s1 = 0; - u32 s2 = 0; - u32 s3 = 0; - u32 s4 = 0; - u32 s5 = 0; - u32 s6 = 0; - u32 s7 = 0; + const u32 src_r00 = src_r0[0]; + const u32 src_r01 = src_r0[1]; + const u32 src_r02 = src_r0[2]; + const u32 src_r03 = src_r0[3]; + const u32 src_r10 = src_r1[0]; + const u32 src_r11 = src_r1[1]; + const u32 src_r12 = src_r1[2]; + const u32 src_r13 = src_r1[3]; switch (offset / 4) { case 0: - s7 = __byte_perm_S (src_r1[2], src_r1[3], selector); - s6 = __byte_perm_S (src_r1[1], src_r1[2], selector); - s5 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s4 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s3 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s2 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s1 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s0 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r12, src_r13, selector); + s6 = __byte_perm_S (src_r11, src_r12, selector); + s5 = __byte_perm_S (src_r10, src_r11, selector); + s4 = __byte_perm_S (src_r03, src_r10, selector); + s3 = __byte_perm_S (src_r02, src_r03, selector); + s2 = __byte_perm_S (src_r01, src_r02, selector); + s1 = __byte_perm_S (src_r00, src_r01, selector); + s0 = __byte_perm_S ( 0, src_r00, selector); break; case 1: - s7 = __byte_perm_S (src_r1[1], src_r1[2], selector); - s6 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s5 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s4 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s3 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s2 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s1 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r11, src_r12, selector); + s6 = __byte_perm_S (src_r10, src_r11, selector); + s5 = __byte_perm_S (src_r03, src_r10, selector); + s4 = __byte_perm_S (src_r02, src_r03, 
selector); + s3 = __byte_perm_S (src_r01, src_r02, selector); + s2 = __byte_perm_S (src_r00, src_r01, selector); + s1 = __byte_perm_S ( 0, src_r00, selector); s0 = 0; break; case 2: - s7 = __byte_perm_S (src_r1[0], src_r1[1], selector); - s6 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s5 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s4 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s3 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s2 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r10, src_r11, selector); + s6 = __byte_perm_S (src_r03, src_r10, selector); + s5 = __byte_perm_S (src_r02, src_r03, selector); + s4 = __byte_perm_S (src_r01, src_r02, selector); + s3 = __byte_perm_S (src_r00, src_r01, selector); + s2 = __byte_perm_S ( 0, src_r00, selector); s1 = 0; s0 = 0; break; case 3: - s7 = __byte_perm_S (src_r0[3], src_r1[0], selector); - s6 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s5 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s4 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s3 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r03, src_r10, selector); + s6 = __byte_perm_S (src_r02, src_r03, selector); + s5 = __byte_perm_S (src_r01, src_r02, selector); + s4 = __byte_perm_S (src_r00, src_r01, selector); + s3 = __byte_perm_S ( 0, src_r00, selector); s2 = 0; s1 = 0; s0 = 0; @@ -954,10 +941,10 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 4: - s7 = __byte_perm_S (src_r0[2], src_r0[3], selector); - s6 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s5 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s4 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r02, src_r03, selector); + s6 = __byte_perm_S (src_r01, src_r02, selector); + s5 = __byte_perm_S (src_r00, src_r01, selector); + s4 = __byte_perm_S ( 0, src_r00, selector); s3 = 0; s2 = 0; s1 = 0; @@ -965,9 +952,9 @@ void append_block8 (const u32 offset, u32 
buf0[4], u32 buf1[4], const u32 src_l0 break; case 5: - s7 = __byte_perm_S (src_r0[1], src_r0[2], selector); - s6 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s5 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r01, src_r02, selector); + s6 = __byte_perm_S (src_r00, src_r01, selector); + s5 = __byte_perm_S ( 0, src_r00, selector); s4 = 0; s3 = 0; s2 = 0; @@ -976,8 +963,8 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 6: - s7 = __byte_perm_S (src_r0[0], src_r0[1], selector); - s6 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S (src_r00, src_r01, selector); + s6 = __byte_perm_S ( 0, src_r00, selector); s5 = 0; s4 = 0; s3 = 0; @@ -987,7 +974,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 break; case 7: - s7 = __byte_perm_S ( 0, src_r0[0], selector); + s7 = __byte_perm_S ( 0, src_r00, selector); s6 = 0; s5 = 0; s4 = 0; @@ -997,6 +984,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 s0 = 0; break; } + #endif buf0[0] = src_l0[0] | s0; buf0[1] = src_l0[1] | s1; @@ -1006,8 +994,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 buf1[1] = src_l1[1] | s5; buf1[2] = src_l1[2] | s6; buf1[3] = src_l1[3] | s7; - - #endif } void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index cb89b0b7b..5ccb727aa 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -10,110 +10,13 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" #define md5crypt_magic 0x00243124u -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - 
u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - 
MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void memcat16 (u32 block0[4], u32 block1[4], u32 
block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; @@ -127,30 +30,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -233,30 +143,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined 
IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -337,24 +256,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, 
append[1], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index d624b4678..af6203263 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -8,6 +8,7 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" @@ -15,104 +16,6 @@ #define md5apr1_magic0 0x72706124u #define md5apr1_magic1 0x00002431u -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, 
MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, 
MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; @@ -126,30 +29,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, 
append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -232,30 +142,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - 
tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -336,24 +255,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], 
append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index 48fa8911f..f46abd13c 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -62,7 +62,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 pw_lenv = ceil ((float) pw_len / 4); - u32 w[64] = { 0 }; + u32x w[64] = { 0 }; for (int idx = 0; idx < pw_lenv; idx++) { @@ -88,13 +88,13 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule * loop */ - u32 w0l = w[0]; + u32x w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0lr = w0l | w0r; + const u32x w0lr = w0l | w0r; w[0] = w0lr; @@ -106,10 +106,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_final_vector (&ctx0); - const u32 a = ctx0.h[0]; - const u32 b = ctx0.h[1]; - const u32 c = ctx0.h[2]; - const u32 d = ctx0.h[3]; + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; md5_ctx_vector_t ctx; @@ -161,10 +161,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h); - const u32 r0 = ctx.h[DGST_R0]; - const u32 r1 = ctx.h[DGST_R1]; - const u32 r2 = ctx.h[DGST_R2]; - const u32 r3 = ctx.h[DGST_R3]; + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; COMPARE_M_SIMD (r0, r1, r2, r3); } @@ -219,7 +219,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule const u32 pw_lenv = 
ceil ((float) pw_len / 4); - u32 w[64] = { 0 }; + u32x w[64] = { 0 }; for (int idx = 0; idx < pw_lenv; idx++) { @@ -245,13 +245,13 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule * loop */ - u32 w0l = w[0]; + u32x w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0lr = w0l | w0r; + const u32x w0lr = w0l | w0r; w[0] = w0lr; @@ -263,10 +263,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_final_vector (&ctx0); - const u32 a = ctx0.h[0]; - const u32 b = ctx0.h[1]; - const u32 c = ctx0.h[2]; - const u32 d = ctx0.h[3]; + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; md5_ctx_vector_t ctx; @@ -318,10 +318,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h); - const u32 r0 = ctx.h[DGST_R0]; - const u32 r1 = ctx.h[DGST_R1]; - const u32 r2 = ctx.h[DGST_R2]; - const u32 r3 = ctx.h[DGST_R3]; + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; COMPARE_S_SIMD (r0, r1, r2, r3); } diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index 774b97327..760557daa 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -8,6 +8,7 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_sha1.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" @@ -2116,33 +2117,42 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], 
append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); - tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = swap32_S (append[4]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp5 = amd_bytealign (in4, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); + tmp5 = swap32_S (tmp5); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], append[4], selector); - tmp5 = __byte_perm (append[4], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); + tmp5 = __byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; @@ -2187,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 } } -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) 
-{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ 
wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ 
wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ 
w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index ec03f6218..a447bf79e 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -2117,33 +2117,42 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); - tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = swap32_S (append[4]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + tmp5 = amd_bytealign (in4, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); + tmp5 = swap32_S (tmp5); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], 
append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], append[4], selector); - tmp5 = __byte_perm (append[4], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); + tmp5 = __byte_perm (in4, 0, selector); #endif const u32 div = offset / 4; @@ -2188,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 } } -void orig_sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) -{ - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; - - #undef K - #define K SHA1C00 - - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t); - SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t); - SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t); - SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t); - SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t); - SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t); 
- SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t); - - #undef K - #define K SHA1C01 - - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); 
SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t); - - #undef K - #define K SHA1C02 - - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 
1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t); - - #undef K - #define K SHA1C03 - - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); 
SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} - __kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { const u32 gid = get_global_id (0); diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index 6b3980270..a0c85c49c 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -8,108 +8,11 @@ #include "inc_hash_functions.cl" #include "inc_types.cl" #include "inc_common.cl" +#include "inc_hash_md5.cl" #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - 
u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); - - MD5_STEP (MD5_H , 
a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; -} - void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; @@ -123,30 +26,37 @@ void memcat16 (u32 block0[4], 
u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif const u32 div = offset / 4; @@ -229,30 +139,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], 
append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif const u32 div = offset / 4; @@ -333,24 +252,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); - if (offset_mod_4 == 0) - { - 
tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, 0, selector); #endif const u32 div = offset / 4; diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index 3483c57ec..a52655b1b 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -32,8 +32,6 @@ __constant u32a k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -#if 1 - void sha256_transform (const u32 w[16], u32 digest[8]) { u32 a = digest[0]; @@ -203,30 +201,37 @@ u32 memcat16 (u32 block[16], const u32 offset, const u32 append[4], const u32 ap const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = 
swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif switch (offset / 4) @@ -337,30 +342,37 @@ u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 a const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, 
append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif u32 carry[4] = { 0, 0, 0, 0 }; @@ -484,7 +496,7 @@ u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 a return new_len; } -u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) +u32 memcat20 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len) { u32 tmp0; u32 tmp1; @@ -497,30 +509,37 @@ u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 ap const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, 0, offset); + + tmp0 = swap32_S (tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV 
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, 0, selector); #endif switch (offset / 4) @@ -626,7 +645,7 @@ u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 ap return offset + append_len; } -u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) +u32 memcat20_x80 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len) { u32 tmp0; u32 tmp1; @@ -639,30 +658,39 @@ u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u3 const int offset_minus_4 = 4 - offset_mod_4; #if defined IS_AMD || defined IS_GENERIC - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); - tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); - tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); - tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); - tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); + u32 in0 = swap32_S (append[0]); + u32 in1 = swap32_S (append[1]); + u32 in2 = swap32_S (append[2]); + u32 in3 = swap32_S (append[3]); + u32 in4 = 0x80000000; - if (offset_mod_4 == 0) - { - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = 0x80; - } + tmp0 = amd_bytealign ( 0, in0, offset); + tmp1 = amd_bytealign (in0, in1, offset); + tmp2 = amd_bytealign (in1, in2, offset); + tmp3 = amd_bytealign (in2, in3, offset); + tmp4 = amd_bytealign (in3, in4, offset); + + tmp0 = swap32_S 
(tmp0); + tmp1 = swap32_S (tmp1); + tmp2 = swap32_S (tmp2); + tmp3 = swap32_S (tmp3); + tmp4 = swap32_S (tmp4); #endif #ifdef IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - tmp0 = __byte_perm ( 0, append[0], selector); - tmp1 = __byte_perm (append[0], append[1], selector); - tmp2 = __byte_perm (append[1], append[2], selector); - tmp3 = __byte_perm (append[2], append[3], selector); - tmp4 = __byte_perm (append[3], 0x80, selector); + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = 0x80; + + tmp0 = __byte_perm ( 0, in0, selector); + tmp1 = __byte_perm (in0, in1, selector); + tmp2 = __byte_perm (in1, in2, selector); + tmp3 = __byte_perm (in2, in3, selector); + tmp4 = __byte_perm (in3, in4, selector); #endif switch (offset / 4) @@ -1201,543 +1229,3 @@ __kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rul #include COMPARE_M } - -#else - -// this is basically a much cleaner version, but apparently drops speeds by over 100% :( - -#define PUTCHAR32_BE(a,p,c) ((u8 *)(a))[(p) ^ 3] = (u8) (c) -#define GETCHAR32_BE(a,p) ((u8 *)(a))[(p) ^ 3] - -typedef struct -{ - u32 state[8]; - u32 buf[32]; - int len; - -} sha256_ctx_t; - -void sha256_transform (const u32 w[16], u32 digest[8]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; - - u32 w0_t = w[ 0]; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; - - #define ROUND_EXPAND() \ - { \ - w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ - w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ - w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ - w3_t = SHA256_EXPAND 
(w1_t, wc_t, w4_t, w3_t); \ - w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ - w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ - w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ - w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ - w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ - w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ - wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ - wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ - wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ - wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ - we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ - wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ - } - - #define ROUND_STEP(i) \ - { \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, 
k_sha256[i + 14]); \ - SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ - } - - ROUND_STEP (0); - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 16; i < 64; i += 16) - { - ROUND_EXPAND (); ROUND_STEP (i); - } - - digest[0] += a; - digest[1] += b; - digest[2] += c; - digest[3] += d; - digest[4] += e; - digest[5] += f; - digest[6] += g; - digest[7] += h; -} - -void sha256_init (sha256_ctx_t *sha256_ctx) -{ - sha256_ctx->state[0] = SHA256M_A; - sha256_ctx->state[1] = SHA256M_B; - sha256_ctx->state[2] = SHA256M_C; - sha256_ctx->state[3] = SHA256M_D; - sha256_ctx->state[4] = SHA256M_E; - sha256_ctx->state[5] = SHA256M_F; - sha256_ctx->state[6] = SHA256M_G; - sha256_ctx->state[7] = SHA256M_H; - - sha256_ctx->len = 0; -} - -void sha256_update (sha256_ctx_t *sha256_ctx, const u32 *buf, int len) -{ - int pos = sha256_ctx->len & 0x3f; - - sha256_ctx->len += len; - - if ((pos + len) < 64) - { - for (int i = 0; i < len; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i)); - } - - return; - } - - int cnt = 64 - pos; - - for (int i = 0; i < cnt; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i)); - } - - sha256_transform (sha256_ctx->buf, sha256_ctx->state); - - len -= cnt; - - for (int i = 0; i < len; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, i, GETCHAR32_BE (buf, cnt + i)); - } -} - -void sha256_final (sha256_ctx_t *sha256_ctx) -{ - int pos = sha256_ctx->len & 0x3f; - - for (int i = pos; i < 64; i++) - { - PUTCHAR32_BE (sha256_ctx->buf, i, 0); - } - - PUTCHAR32_BE (sha256_ctx->buf, pos, 0x80); - - if (pos >= 56) - { - sha256_transform (sha256_ctx->buf, sha256_ctx->state); - - sha256_ctx->buf[ 0] = 0; - sha256_ctx->buf[ 1] = 0; - sha256_ctx->buf[ 2] = 0; - sha256_ctx->buf[ 3] = 0; - sha256_ctx->buf[ 4] = 0; - sha256_ctx->buf[ 5] = 0; - sha256_ctx->buf[ 6] = 0; - sha256_ctx->buf[ 7] = 0; - sha256_ctx->buf[ 8] = 0; - sha256_ctx->buf[ 9] = 0; - sha256_ctx->buf[10] = 0; - sha256_ctx->buf[11] 
= 0; - sha256_ctx->buf[12] = 0; - sha256_ctx->buf[13] = 0; - sha256_ctx->buf[14] = 0; - sha256_ctx->buf[15] = 0; - } - - sha256_ctx->buf[15] = sha256_ctx->len * 8; - - sha256_transform (sha256_ctx->buf, sha256_ctx->state); -} - -__kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - u32 pw[4]; - - pw[0] = swap32 (pws[gid].i[0]); - pw[1] = swap32 (pws[gid].i[1]); - pw[2] = swap32 (pws[gid].i[2]); - pw[3] = swap32 (pws[gid].i[3]); - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - u32 salt[4]; - - salt[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); - salt[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); - salt[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); - salt[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); - - u32 salt_len = salt_bufs[salt_pos].salt_len; - - /** - * begin - */ - - sha256_ctx_t sha256_ctx; - - sha256_init (&sha256_ctx); - - sha256_update (&sha256_ctx, 
pw, pw_len); - sha256_update (&sha256_ctx, salt, salt_len); - sha256_update (&sha256_ctx, pw, pw_len); - - sha256_final (&sha256_ctx); - - u32 tmp[8]; - - tmp[0] = sha256_ctx.state[0]; - tmp[1] = sha256_ctx.state[1]; - tmp[2] = sha256_ctx.state[2]; - tmp[3] = sha256_ctx.state[3]; - tmp[4] = sha256_ctx.state[4]; - tmp[5] = sha256_ctx.state[5]; - tmp[6] = sha256_ctx.state[6]; - tmp[7] = sha256_ctx.state[7]; - - sha256_init (&sha256_ctx); - - sha256_update (&sha256_ctx, pw, pw_len); - sha256_update (&sha256_ctx, salt, salt_len); - sha256_update (&sha256_ctx, tmp, pw_len); - - for (u32 j = pw_len; j; j >>= 1) - { - if (j & 1) - { - sha256_update (&sha256_ctx, tmp, 32); - } - else - { - sha256_update (&sha256_ctx, pw, pw_len); - } - } - - sha256_final (&sha256_ctx); - - tmps[gid].alt_result[0] = sha256_ctx.state[0]; - tmps[gid].alt_result[1] = sha256_ctx.state[1]; - tmps[gid].alt_result[2] = sha256_ctx.state[2]; - tmps[gid].alt_result[3] = sha256_ctx.state[3]; - tmps[gid].alt_result[4] = sha256_ctx.state[4]; - tmps[gid].alt_result[5] = sha256_ctx.state[5]; - tmps[gid].alt_result[6] = sha256_ctx.state[6]; - tmps[gid].alt_result[7] = sha256_ctx.state[7]; - - // p_bytes - - sha256_init (&sha256_ctx); - - for (u32 j = 0; j < pw_len; j++) - { - sha256_update (&sha256_ctx, pw, pw_len); - } - - sha256_final (&sha256_ctx); - - tmps[gid].p_bytes[0] = sha256_ctx.state[0]; - tmps[gid].p_bytes[1] = sha256_ctx.state[1]; - tmps[gid].p_bytes[2] = sha256_ctx.state[2]; - tmps[gid].p_bytes[3] = sha256_ctx.state[3]; - - // s_bytes - - sha256_init (&sha256_ctx); - - for (u32 j = 0; j < 16 + ((tmps[gid].alt_result[0] >> 24) & 0xff); j++) - { - sha256_update (&sha256_ctx, salt, salt_len); - } - - sha256_final (&sha256_ctx); - - tmps[gid].s_bytes[0] = sha256_ctx.state[0]; - tmps[gid].s_bytes[1] = sha256_ctx.state[1]; - tmps[gid].s_bytes[2] = sha256_ctx.state[2]; - tmps[gid].s_bytes[3] = sha256_ctx.state[3]; -} - -__kernel void m07400_loop (__global pw_t *pws, __global const kernel_rule_t 
*rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - u32 p_bytes0[4]; - - p_bytes0[0] = tmps[gid].p_bytes[0]; - p_bytes0[1] = tmps[gid].p_bytes[1]; - p_bytes0[2] = tmps[gid].p_bytes[2]; - p_bytes0[3] = tmps[gid].p_bytes[3]; - - const u32 pw_len = pws[gid].pw_len; - - u32 s_bytes0[4]; - - s_bytes0[0] = tmps[gid].s_bytes[0]; - s_bytes0[1] = tmps[gid].s_bytes[1]; - s_bytes0[2] = tmps[gid].s_bytes[2]; - s_bytes0[3] = tmps[gid].s_bytes[3]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; - - u32 wpc_len[8]; - - wpc_len[0] = 32 + 0 + 0 + pw_len; - wpc_len[1] = pw_len + 0 + 0 + 32; - wpc_len[2] = 32 + salt_len + 0 + pw_len; - wpc_len[3] = pw_len + salt_len + 0 + 32; - wpc_len[4] = 32 + 0 + pw_len + pw_len; - wpc_len[5] = pw_len + 0 + pw_len + 32; - wpc_len[6] = 32 + salt_len + pw_len + pw_len; - wpc_len[7] = pw_len + salt_len + pw_len + 32; - - u32 wpc[8][32] = { { 0 } }; - - for (u32 i = 0; i < 8; i++) - { - u32 block_len = 0; - - if (i & 1) - { - for 
(u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - else - { - block_len += 32; - } - - if (i & 2) - { - for (u32 j = 0; j < salt_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (s_bytes0, j)); - } - } - - if (i & 4) - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - - if (i & 1) - { - block_len += 32; - } - else - { - for (u32 j = 0; j < pw_len; j++) - { - PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j)); - } - } - - PUTCHAR32_BE (wpc[i], block_len, 0x80); - - if (block_len < 56) - { - wpc[i][15] = block_len * 8; - } - else - { - wpc[i][31] = block_len * 8; - } - } - - /** - * base - */ - - u32 alt_result[8]; - - alt_result[0] = tmps[gid].alt_result[0]; - alt_result[1] = tmps[gid].alt_result[1]; - alt_result[2] = tmps[gid].alt_result[2]; - alt_result[3] = tmps[gid].alt_result[3]; - alt_result[4] = tmps[gid].alt_result[4]; - alt_result[5] = tmps[gid].alt_result[5]; - alt_result[6] = tmps[gid].alt_result[6]; - alt_result[7] = tmps[gid].alt_result[7]; - - /* Repeatedly run the collected hash value through SHA256 to burn - CPU cycles. */ - - for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++) - { - const u32 j1 = (j & 1) ? 1 : 0; - const u32 j3 = (j % 3) ? 2 : 0; - const u32 j7 = (j % 7) ? 
4 : 0; - - const u32 pc = j1 + j3 + j7; - - u32 block[32]; - - block[ 0] = wpc[pc][ 0]; - block[ 1] = wpc[pc][ 1]; - block[ 2] = wpc[pc][ 2]; - block[ 3] = wpc[pc][ 3]; - block[ 4] = wpc[pc][ 4]; - block[ 5] = wpc[pc][ 5]; - block[ 6] = wpc[pc][ 6]; - block[ 7] = wpc[pc][ 7]; - block[ 8] = wpc[pc][ 8]; - block[ 9] = wpc[pc][ 9]; - block[10] = wpc[pc][10]; - block[11] = wpc[pc][11]; - block[12] = wpc[pc][12]; - block[13] = wpc[pc][13]; - block[14] = wpc[pc][14]; - block[15] = wpc[pc][15]; - block[16] = wpc[pc][16]; - block[17] = wpc[pc][17]; - block[18] = wpc[pc][18]; - block[19] = wpc[pc][19]; - block[20] = wpc[pc][20]; - block[21] = wpc[pc][21]; - block[22] = wpc[pc][22]; - block[23] = wpc[pc][23]; - block[24] = wpc[pc][24]; - block[25] = wpc[pc][25]; - block[26] = wpc[pc][26]; - block[27] = wpc[pc][27]; - block[28] = wpc[pc][28]; - block[29] = wpc[pc][29]; - block[30] = wpc[pc][30]; - block[31] = wpc[pc][31]; - - const u32 block_len = wpc_len[pc]; - - if (j1) - { - #ifdef _unroll - #pragma unroll - #endif - for (u32 k = 0, p = block_len - 32; k < 32; k++, p++) - { - PUTCHAR32_BE (block, p, GETCHAR32_BE (alt_result, k)); - } - } - else - { - block[0] = alt_result[0]; - block[1] = alt_result[1]; - block[2] = alt_result[2]; - block[3] = alt_result[3]; - block[4] = alt_result[4]; - block[5] = alt_result[5]; - block[6] = alt_result[6]; - block[7] = alt_result[7]; - } - - alt_result[0] = SHA256M_A; - alt_result[1] = SHA256M_B; - alt_result[2] = SHA256M_C; - alt_result[3] = SHA256M_D; - alt_result[4] = SHA256M_E; - alt_result[5] = SHA256M_F; - alt_result[6] = SHA256M_G; - alt_result[7] = SHA256M_H; - - sha256_transform (block, alt_result); - - if (block_len >= 56) - { - sha256_transform (block + 16, alt_result); - } - } - - tmps[gid].alt_result[0] = alt_result[0]; - tmps[gid].alt_result[1] = alt_result[1]; - tmps[gid].alt_result[2] = alt_result[2]; - tmps[gid].alt_result[3] = alt_result[3]; - tmps[gid].alt_result[4] = alt_result[4]; - tmps[gid].alt_result[5] = 
alt_result[5]; - tmps[gid].alt_result[6] = alt_result[6]; - tmps[gid].alt_result[7] = alt_result[7]; -} - -__kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * base - */ - - const u32 gid = get_global_id (0); - - if (gid >= gid_max) return; - - const u32 lid = get_local_id (0); - - const u32 r0 = swap32 (tmps[gid].alt_result[0]); - const u32 r1 = swap32 (tmps[gid].alt_result[1]); - const u32 r2 = swap32 (tmps[gid].alt_result[2]); - const u32 r3 = swap32 (tmps[gid].alt_result[3]); - - #define il_pos 0 - - #include COMPARE_M -} - -#endif diff --git a/OpenCL/m11400_a0-optimized.cl b/OpenCL/m11400_a0-optimized.cl index 58d7f4d5c..4e740b646 100644 --- a/OpenCL/m11400_a0-optimized.cl +++ b/OpenCL/m11400_a0-optimized.cl @@ -27,76 +27,79 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], 
l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; + const u32 mod = offset & 3; + const u32 div = offset / 4; #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; + u32x append00 = swap32 (append0[0]); + u32x append01 = swap32 (append0[1]); + u32x append02 = swap32 (append0[2]); + u32x append03 = swap32 (append0[3]); + u32x append10 = swap32 (append1[0]); + u32x append11 = swap32 (append1[1]); + u32x append12 = swap32 (append1[2]); + u32x append13 = swap32 (append1[3]); + u32x append20 = swap32 (append2[0]); + u32x append21 = swap32 (append2[1]); + u32x append22 = swap32 (append2[2]); + u32x append23 = swap32 (append2[3]); + u32x append30 = swap32 (append3[0]); + u32x append31 = swap32 (append3[1]); + u32x append32 = swap32 (append3[2]); + u32x append33 = swap32 (append3[3]); + u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32x append2_t[4]; - - append2_t[0] = amd_bytealign 
(append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32x append4_t[4]; - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); + append0_t[0] = amd_bytealign ( 0, append00, offset); + append0_t[1] = amd_bytealign (append00, append01, offset); + append0_t[2] = amd_bytealign (append01, append02, offset); + append0_t[3] = amd_bytealign (append02, append03, offset); + append1_t[0] = amd_bytealign (append03, append10, offset); + append1_t[1] = amd_bytealign (append10, append11, offset); + append1_t[2] = amd_bytealign (append11, append12, offset); + append1_t[3] = amd_bytealign (append12, append13, offset); + append2_t[0] = amd_bytealign (append13, append20, offset); + append2_t[1] = amd_bytealign (append20, append21, offset); + append2_t[2] = amd_bytealign (append21, append22, offset); + append2_t[3] = amd_bytealign (append22, append23, offset); + append3_t[0] = amd_bytealign (append23, append30, offset); + append3_t[1] = amd_bytealign (append30, append31, offset); + append3_t[2] = amd_bytealign (append31, append32, offset); + append3_t[3] = amd_bytealign (append32, append33, offset); + append4_t[0] = amd_bytealign (append33, 0, offset); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; + append0_t[0] = swap32 (append0_t[0]); + append0_t[1] = swap32 (append0_t[1]); + append0_t[2] = swap32 (append0_t[2]); + 
append0_t[3] = swap32 (append0_t[3]); + append1_t[0] = swap32 (append1_t[0]); + append1_t[1] = swap32 (append1_t[1]); + append1_t[2] = swap32 (append1_t[2]); + append1_t[3] = swap32 (append1_t[3]); + append2_t[0] = swap32 (append2_t[0]); + append2_t[1] = swap32 (append2_t[1]); + append2_t[2] = swap32 (append2_t[2]); + append2_t[3] = swap32 (append2_t[3]); + append3_t[0] = swap32 (append3_t[0]); + append3_t[1] = swap32 (append3_t[1]); + append3_t[2] = swap32 (append3_t[2]); + append3_t[3] = swap32 (append3_t[3]); + append4_t[0] = swap32 (append4_t[0]); + append4_t[1] = swap32 (append4_t[1]); + append4_t[2] = swap32 (append4_t[2]); + append4_t[3] = swap32 (append4_t[3]); - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } #endif #ifdef IS_NV @@ -105,40 +108,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + u32x append00 = append0[0]; + u32x append01 = append0[1]; + u32x append02 = append0[2]; + u32x append03 = append0[3]; + u32x append10 = append1[0]; + u32x append11 = append1[1]; + u32x append12 = append1[2]; + u32x append13 = append1[3]; + u32x append20 = append2[0]; + u32x append21 = append2[1]; + u32x append22 = append2[2]; + u32x append23 = append2[3]; + u32x append30 = append3[0]; + u32x append31 = append3[1]; + u32x append32 = append3[2]; + u32x append33 = append3[3]; + u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - 
append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32x append4_t[4]; - append4_t[0] = __byte_perm (append3[3], 0, selector); + append0_t[0] = __byte_perm ( 0, append00, selector); + append0_t[1] = __byte_perm (append00, append01, selector); + append0_t[2] = __byte_perm (append01, append02, selector); + append0_t[3] = __byte_perm (append02, append03, selector); + append1_t[0] = __byte_perm (append03, append10, selector); + append1_t[1] = __byte_perm (append10, append11, selector); + append1_t[2] = __byte_perm (append11, append12, selector); + append1_t[3] = __byte_perm (append12, append13, selector); + append2_t[0] = __byte_perm (append13, append20, selector); + append2_t[1] = __byte_perm (append20, append21, selector); + append2_t[2] = __byte_perm (append21, append22, selector); + append2_t[3] = __byte_perm (append22, append23, selector); + append3_t[0] = __byte_perm (append23, append30, selector); + append3_t[1] = __byte_perm (append30, append31, selector); + append3_t[2] = __byte_perm (append31, append32, selector); + append3_t[3] = __byte_perm (append32, append33, selector); + append4_t[0] = __byte_perm (append33, 
0, selector); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; + #endif switch (div) @@ -147,22 +160,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 1] = append0_t[1]; block0[ 2] = append0_t[2]; block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; block0[ 5] = append1_t[1]; block0[ 6] = append1_t[2]; block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; block0[ 9] = append2_t[1]; block0[10] = append2_t[2]; block0[11] = append2_t[3]; - block0[12] = append3_t[0]; block0[13] = append3_t[1]; block0[14] = append3_t[2]; block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; block1[ 1] = append4_t[1]; block1[ 2] = append4_t[2]; @@ -173,22 +182,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 2] = append0_t[1]; block0[ 3] = append0_t[2]; block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; block0[ 6] = append1_t[1]; block0[ 7] = append1_t[2]; block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; block0[10] = append2_t[1]; block0[11] = append2_t[2]; block0[12] = append2_t[3]; - block0[13] = append3_t[0]; block0[14] = append3_t[1]; block0[15] = append3_t[2]; block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; block1[ 2] = append4_t[1]; block1[ 3] = append4_t[2]; @@ -199,22 +204,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 3] = append0_t[1]; block0[ 4] = append0_t[2]; block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; block0[ 7] = append1_t[1]; block0[ 8] = append1_t[2]; block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; block0[11] = append2_t[1]; block0[12] = append2_t[2]; block0[13] = append2_t[3]; - block0[14] = append3_t[0]; block0[15] = append3_t[1]; block1[ 0] = append3_t[2]; block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; block1[ 3] = append4_t[1]; block1[ 4] = append4_t[2]; @@ -225,22 +226,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 4] = 
append0_t[1]; block0[ 5] = append0_t[2]; block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; block0[ 8] = append1_t[1]; block0[ 9] = append1_t[2]; block0[10] = append1_t[3]; - block0[11] = append2_t[0]; block0[12] = append2_t[1]; block0[13] = append2_t[2]; block0[14] = append2_t[3]; - block0[15] = append3_t[0]; block1[ 0] = append3_t[1]; block1[ 1] = append3_t[2]; block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; block1[ 4] = append4_t[1]; block1[ 5] = append4_t[2]; @@ -251,22 +248,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 5] = append0_t[1]; block0[ 6] = append0_t[2]; block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; block0[ 9] = append1_t[1]; block0[10] = append1_t[2]; block0[11] = append1_t[3]; - block0[12] = append2_t[0]; block0[13] = append2_t[1]; block0[14] = append2_t[2]; block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; block1[ 1] = append3_t[1]; block1[ 2] = append3_t[2]; block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; block1[ 5] = append4_t[1]; block1[ 6] = append4_t[2]; @@ -277,22 +270,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 6] = append0_t[1]; block0[ 7] = append0_t[2]; block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; block0[10] = append1_t[1]; block0[11] = append1_t[2]; block0[12] = append1_t[3]; - block0[13] = append2_t[0]; block0[14] = append2_t[1]; block0[15] = append2_t[2]; block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; block1[ 2] = append3_t[1]; block1[ 3] = append3_t[2]; block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; block1[ 6] = append4_t[1]; block1[ 7] = append4_t[2]; @@ -303,22 +292,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 7] = append0_t[1]; block0[ 8] = append0_t[2]; block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; block0[11] = append1_t[1]; block0[12] = append1_t[2]; block0[13] = append1_t[3]; - block0[14] = append2_t[0]; 
block0[15] = append2_t[1]; block1[ 0] = append2_t[2]; block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; block1[ 3] = append3_t[1]; block1[ 4] = append3_t[2]; block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; block1[ 7] = append4_t[1]; block1[ 8] = append4_t[2]; @@ -329,22 +314,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 8] = append0_t[1]; block0[ 9] = append0_t[2]; block0[10] = append0_t[3]; - block0[11] = append1_t[0]; block0[12] = append1_t[1]; block0[13] = append1_t[2]; block0[14] = append1_t[3]; - block0[15] = append2_t[0]; block1[ 0] = append2_t[1]; block1[ 1] = append2_t[2]; block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; block1[ 4] = append3_t[1]; block1[ 5] = append3_t[2]; block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; block1[ 8] = append4_t[1]; block1[ 9] = append4_t[2]; @@ -355,22 +336,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 9] = append0_t[1]; block0[10] = append0_t[2]; block0[11] = append0_t[3]; - block0[12] = append1_t[0]; block0[13] = append1_t[1]; block0[14] = append1_t[2]; block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; block1[ 1] = append2_t[1]; block1[ 2] = append2_t[2]; block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; block1[ 5] = append3_t[1]; block1[ 6] = append3_t[2]; block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; block1[ 9] = append4_t[1]; block1[10] = append4_t[2]; @@ -381,22 +358,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[10] = append0_t[1]; block0[11] = append0_t[2]; block0[12] = append0_t[3]; - block0[13] = append1_t[0]; block0[14] = append1_t[1]; block0[15] = append1_t[2]; block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; block1[ 2] = append2_t[1]; block1[ 3] = append2_t[2]; block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; block1[ 6] = append3_t[1]; block1[ 7] = append3_t[2]; block1[ 8] = append3_t[3]; - block1[ 9] = 
append4_t[0]; block1[10] = append4_t[1]; block1[11] = append4_t[2]; @@ -407,22 +380,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[11] = append0_t[1]; block0[12] = append0_t[2]; block0[13] = append0_t[3]; - block0[14] = append1_t[0]; block0[15] = append1_t[1]; block1[ 0] = append1_t[2]; block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; block1[ 3] = append2_t[1]; block1[ 4] = append2_t[2]; block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; block1[ 7] = append3_t[1]; block1[ 8] = append3_t[2]; block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; block1[11] = append4_t[1]; block1[12] = append4_t[2]; @@ -433,22 +402,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[12] = append0_t[1]; block0[13] = append0_t[2]; block0[14] = append0_t[3]; - block0[15] = append1_t[0]; block1[ 0] = append1_t[1]; block1[ 1] = append1_t[2]; block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; block1[ 4] = append2_t[1]; block1[ 5] = append2_t[2]; block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; block1[ 8] = append3_t[1]; block1[ 9] = append3_t[2]; block1[10] = append3_t[3]; - block1[11] = append4_t[0]; block1[12] = append4_t[1]; block1[13] = append4_t[2]; @@ -459,22 +424,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[13] = append0_t[1]; block0[14] = append0_t[2]; block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; block1[ 1] = append1_t[1]; block1[ 2] = append1_t[2]; block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; block1[ 5] = append2_t[1]; block1[ 6] = append2_t[2]; block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; block1[ 9] = append3_t[1]; block1[10] = append3_t[2]; block1[11] = append3_t[3]; - block1[12] = append4_t[0]; block1[13] = append4_t[1]; block1[14] = append4_t[2]; @@ -485,22 +446,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[14] = append0_t[1]; block0[15] = 
append0_t[2]; block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; block1[ 2] = append1_t[1]; block1[ 3] = append1_t[2]; block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; block1[ 6] = append2_t[1]; block1[ 7] = append2_t[2]; block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; block1[10] = append3_t[1]; block1[11] = append3_t[2]; block1[12] = append3_t[3]; - block1[13] = append4_t[0]; block1[14] = append4_t[1]; block1[15] = append4_t[2]; @@ -510,22 +467,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[15] = append0_t[1]; block1[ 0] = append0_t[2]; block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; block1[ 7] = append2_t[1]; block1[ 8] = append2_t[2]; block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; block1[11] = append3_t[1]; block1[12] = append3_t[2]; block1[13] = append3_t[3]; - block1[14] = append4_t[0]; block1[15] = append4_t[1]; break; @@ -534,22 +487,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 0] = append0_t[1]; block1[ 1] = append0_t[2]; block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; block1[ 8] = append2_t[1]; block1[ 9] = append2_t[2]; block1[10] = append2_t[3]; - block1[11] = append3_t[0]; block1[12] = append3_t[1]; block1[13] = append3_t[2]; block1[14] = append3_t[3]; - block1[15] = append4_t[0]; break; @@ -557,17 +506,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 1] = append0_t[1]; block1[ 2] = append0_t[2]; block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; block1[ 5] = append1_t[1]; block1[ 6] = append1_t[2]; block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; block1[ 9] = append2_t[1]; block1[10] = append2_t[2]; block1[11] = append2_t[3]; - block1[12] = 
append3_t[0]; block1[13] = append3_t[1]; block1[14] = append3_t[2]; @@ -578,17 +524,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 2] = append0_t[1]; block1[ 3] = append0_t[2]; block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; block1[ 6] = append1_t[1]; block1[ 7] = append1_t[2]; block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; block1[10] = append2_t[1]; block1[11] = append2_t[2]; block1[12] = append2_t[3]; - block1[13] = append3_t[0]; block1[14] = append3_t[1]; block1[15] = append3_t[2]; @@ -598,17 +541,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 3] = append0_t[1]; block1[ 4] = append0_t[2]; block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; block1[ 7] = append1_t[1]; block1[ 8] = append1_t[2]; block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; block1[11] = append2_t[1]; block1[12] = append2_t[2]; block1[13] = append2_t[3]; - block1[14] = append3_t[0]; block1[15] = append3_t[1]; break; @@ -617,17 +557,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 4] = append0_t[1]; block1[ 5] = append0_t[2]; block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; block1[ 8] = append1_t[1]; block1[ 9] = append1_t[2]; block1[10] = append1_t[3]; - block1[11] = append2_t[0]; block1[12] = append2_t[1]; block1[13] = append2_t[2]; block1[14] = append2_t[3]; - block1[15] = append3_t[0]; break; @@ -635,12 +572,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 5] = append0_t[1]; block1[ 6] = append0_t[2]; block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; block1[ 9] = append1_t[1]; block1[10] = append1_t[2]; block1[11] = append1_t[3]; - block1[12] = append2_t[0]; block1[13] = append2_t[1]; block1[14] = append2_t[2]; @@ -651,12 +586,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 6] = append0_t[1]; block1[ 7] = append0_t[2]; block1[ 
8] = append0_t[3]; - block1[ 9] = append1_t[0]; block1[10] = append1_t[1]; block1[11] = append1_t[2]; block1[12] = append1_t[3]; - block1[13] = append2_t[0]; block1[14] = append2_t[1]; block1[15] = append2_t[2]; @@ -666,12 +599,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 7] = append0_t[1]; block1[ 8] = append0_t[2]; block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; block1[11] = append1_t[1]; block1[12] = append1_t[2]; block1[13] = append1_t[3]; - block1[14] = append2_t[0]; block1[15] = append2_t[1]; break; @@ -680,12 +611,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 8] = append0_t[1]; block1[ 9] = append0_t[2]; block1[10] = append0_t[3]; - block1[11] = append1_t[0]; block1[12] = append1_t[1]; block1[13] = append1_t[2]; block1[14] = append1_t[3]; - block1[15] = append2_t[0]; break; @@ -693,7 +622,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 9] = append0_t[1]; block1[10] = append0_t[2]; block1[11] = append0_t[3]; - block1[12] = append1_t[0]; block1[13] = append1_t[1]; block1[14] = append1_t[2]; @@ -704,7 +632,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[10] = append0_t[1]; block1[11] = append0_t[2]; block1[12] = append0_t[3]; - block1[13] = append1_t[0]; block1[14] = append1_t[1]; block1[15] = append1_t[2]; @@ -714,7 +641,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[11] = append0_t[1]; block1[12] = append0_t[2]; block1[13] = append0_t[3]; - block1[14] = append1_t[0]; block1[15] = append1_t[1]; break; @@ -723,7 +649,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[12] = append0_t[1]; block1[13] = append0_t[2]; block1[14] = append0_t[3]; - block1[15] = append1_t[0]; break; @@ -743,7 +668,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x break; } - 
u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } diff --git a/OpenCL/m11400_a1-optimized.cl b/OpenCL/m11400_a1-optimized.cl index 4fc47020e..4df896258 100644 --- a/OpenCL/m11400_a1-optimized.cl +++ b/OpenCL/m11400_a1-optimized.cl @@ -25,76 +25,79 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; + const u32 mod = offset & 3; + const u32 div = offset / 4; #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; + u32x append00 = swap32 (append0[0]); + u32x append01 = swap32 (append0[1]); + u32x append02 = swap32 (append0[2]); + u32x append03 = swap32 (append0[3]); + u32x append10 = swap32 (append1[0]); + u32x append11 = swap32 (append1[1]); + u32x append12 = swap32 (append1[2]); + u32x append13 = swap32 (append1[3]); + u32x append20 = swap32 (append2[0]); + u32x append21 = swap32 (append2[1]); + u32x append22 = swap32 (append2[2]); + u32x append23 = swap32 (append2[3]); + u32x append30 = swap32 (append3[0]); + u32x append31 = swap32 (append3[1]); + u32x append32 = swap32 (append3[2]); + u32x append33 = swap32 (append3[3]); + u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - 
append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32x append4_t[4]; - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); + append0_t[0] = amd_bytealign ( 0, append00, offset); + append0_t[1] = amd_bytealign (append00, append01, offset); + append0_t[2] = amd_bytealign (append01, append02, offset); + append0_t[3] = amd_bytealign (append02, append03, offset); + append1_t[0] = amd_bytealign (append03, append10, offset); + append1_t[1] = amd_bytealign (append10, append11, offset); + append1_t[2] = amd_bytealign (append11, append12, offset); + append1_t[3] = amd_bytealign (append12, append13, offset); + append2_t[0] = amd_bytealign (append13, append20, offset); + append2_t[1] = amd_bytealign (append20, append21, offset); + append2_t[2] = amd_bytealign (append21, append22, offset); + append2_t[3] = amd_bytealign (append22, append23, offset); + append3_t[0] = amd_bytealign (append23, append30, offset); + append3_t[1] = amd_bytealign 
(append30, append31, offset); + append3_t[2] = amd_bytealign (append31, append32, offset); + append3_t[3] = amd_bytealign (append32, append33, offset); + append4_t[0] = amd_bytealign (append33, 0, offset); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; + append0_t[0] = swap32 (append0_t[0]); + append0_t[1] = swap32 (append0_t[1]); + append0_t[2] = swap32 (append0_t[2]); + append0_t[3] = swap32 (append0_t[3]); + append1_t[0] = swap32 (append1_t[0]); + append1_t[1] = swap32 (append1_t[1]); + append1_t[2] = swap32 (append1_t[2]); + append1_t[3] = swap32 (append1_t[3]); + append2_t[0] = swap32 (append2_t[0]); + append2_t[1] = swap32 (append2_t[1]); + append2_t[2] = swap32 (append2_t[2]); + append2_t[3] = swap32 (append2_t[3]); + append3_t[0] = swap32 (append3_t[0]); + append3_t[1] = swap32 (append3_t[1]); + append3_t[2] = swap32 (append3_t[2]); + append3_t[3] = swap32 (append3_t[3]); + append4_t[0] = swap32 (append4_t[0]); + append4_t[1] = swap32 (append4_t[1]); + append4_t[2] = swap32 (append4_t[2]); + append4_t[3] = swap32 (append4_t[3]); - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } #endif #ifdef IS_NV @@ -103,40 +106,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + u32x append00 = append0[0]; + u32x append01 = append0[1]; + u32x append02 = append0[2]; + u32x append03 = append0[3]; + u32x append10 = append1[0]; + u32x append11 = 
append1[1]; + u32x append12 = append1[2]; + u32x append13 = append1[3]; + u32x append20 = append2[0]; + u32x append21 = append2[1]; + u32x append22 = append2[2]; + u32x append23 = append2[3]; + u32x append30 = append3[0]; + u32x append31 = append3[1]; + u32x append32 = append3[2]; + u32x append33 = append3[3]; + u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32x append4_t[4]; - append4_t[0] = __byte_perm (append3[3], 0, selector); + append0_t[0] = __byte_perm ( 0, append00, selector); + append0_t[1] = __byte_perm (append00, append01, selector); + append0_t[2] = __byte_perm (append01, append02, selector); + append0_t[3] = __byte_perm (append02, append03, selector); + append1_t[0] = __byte_perm (append03, append10, selector); + append1_t[1] = __byte_perm (append10, append11, selector); + append1_t[2] = __byte_perm (append11, append12, selector); + append1_t[3] = __byte_perm (append12, append13, selector); + 
append2_t[0] = __byte_perm (append13, append20, selector); + append2_t[1] = __byte_perm (append20, append21, selector); + append2_t[2] = __byte_perm (append21, append22, selector); + append2_t[3] = __byte_perm (append22, append23, selector); + append3_t[0] = __byte_perm (append23, append30, selector); + append3_t[1] = __byte_perm (append30, append31, selector); + append3_t[2] = __byte_perm (append31, append32, selector); + append3_t[3] = __byte_perm (append32, append33, selector); + append4_t[0] = __byte_perm (append33, 0, selector); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; + #endif switch (div) @@ -145,22 +158,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 1] = append0_t[1]; block0[ 2] = append0_t[2]; block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; block0[ 5] = append1_t[1]; block0[ 6] = append1_t[2]; block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; block0[ 9] = append2_t[1]; block0[10] = append2_t[2]; block0[11] = append2_t[3]; - block0[12] = append3_t[0]; block0[13] = append3_t[1]; block0[14] = append3_t[2]; block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; block1[ 1] = append4_t[1]; block1[ 2] = append4_t[2]; @@ -171,22 +180,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 2] = append0_t[1]; block0[ 3] = append0_t[2]; block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; block0[ 6] = append1_t[1]; block0[ 7] = append1_t[2]; block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; block0[10] = append2_t[1]; block0[11] = append2_t[2]; block0[12] = append2_t[3]; - block0[13] = append3_t[0]; block0[14] = append3_t[1]; block0[15] = append3_t[2]; block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; block1[ 2] = append4_t[1]; block1[ 3] = append4_t[2]; @@ -197,22 +202,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 3] = append0_t[1]; block0[ 4] = append0_t[2]; block0[ 5] = append0_t[3]; - 
block0[ 6] = append1_t[0]; block0[ 7] = append1_t[1]; block0[ 8] = append1_t[2]; block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; block0[11] = append2_t[1]; block0[12] = append2_t[2]; block0[13] = append2_t[3]; - block0[14] = append3_t[0]; block0[15] = append3_t[1]; block1[ 0] = append3_t[2]; block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; block1[ 3] = append4_t[1]; block1[ 4] = append4_t[2]; @@ -223,22 +224,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 4] = append0_t[1]; block0[ 5] = append0_t[2]; block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; block0[ 8] = append1_t[1]; block0[ 9] = append1_t[2]; block0[10] = append1_t[3]; - block0[11] = append2_t[0]; block0[12] = append2_t[1]; block0[13] = append2_t[2]; block0[14] = append2_t[3]; - block0[15] = append3_t[0]; block1[ 0] = append3_t[1]; block1[ 1] = append3_t[2]; block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; block1[ 4] = append4_t[1]; block1[ 5] = append4_t[2]; @@ -249,22 +246,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 5] = append0_t[1]; block0[ 6] = append0_t[2]; block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; block0[ 9] = append1_t[1]; block0[10] = append1_t[2]; block0[11] = append1_t[3]; - block0[12] = append2_t[0]; block0[13] = append2_t[1]; block0[14] = append2_t[2]; block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; block1[ 1] = append3_t[1]; block1[ 2] = append3_t[2]; block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; block1[ 5] = append4_t[1]; block1[ 6] = append4_t[2]; @@ -275,22 +268,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 6] = append0_t[1]; block0[ 7] = append0_t[2]; block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; block0[10] = append1_t[1]; block0[11] = append1_t[2]; block0[12] = append1_t[3]; - block0[13] = append2_t[0]; block0[14] = append2_t[1]; block0[15] = append2_t[2]; block1[ 0] = 
append2_t[3]; - block1[ 1] = append3_t[0]; block1[ 2] = append3_t[1]; block1[ 3] = append3_t[2]; block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; block1[ 6] = append4_t[1]; block1[ 7] = append4_t[2]; @@ -301,22 +290,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 7] = append0_t[1]; block0[ 8] = append0_t[2]; block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; block0[11] = append1_t[1]; block0[12] = append1_t[2]; block0[13] = append1_t[3]; - block0[14] = append2_t[0]; block0[15] = append2_t[1]; block1[ 0] = append2_t[2]; block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; block1[ 3] = append3_t[1]; block1[ 4] = append3_t[2]; block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; block1[ 7] = append4_t[1]; block1[ 8] = append4_t[2]; @@ -327,22 +312,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 8] = append0_t[1]; block0[ 9] = append0_t[2]; block0[10] = append0_t[3]; - block0[11] = append1_t[0]; block0[12] = append1_t[1]; block0[13] = append1_t[2]; block0[14] = append1_t[3]; - block0[15] = append2_t[0]; block1[ 0] = append2_t[1]; block1[ 1] = append2_t[2]; block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; block1[ 4] = append3_t[1]; block1[ 5] = append3_t[2]; block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; block1[ 8] = append4_t[1]; block1[ 9] = append4_t[2]; @@ -353,22 +334,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 9] = append0_t[1]; block0[10] = append0_t[2]; block0[11] = append0_t[3]; - block0[12] = append1_t[0]; block0[13] = append1_t[1]; block0[14] = append1_t[2]; block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; block1[ 1] = append2_t[1]; block1[ 2] = append2_t[2]; block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; block1[ 5] = append3_t[1]; block1[ 6] = append3_t[2]; block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; block1[ 9] = append4_t[1]; block1[10] = append4_t[2]; @@ 
-379,22 +356,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[10] = append0_t[1]; block0[11] = append0_t[2]; block0[12] = append0_t[3]; - block0[13] = append1_t[0]; block0[14] = append1_t[1]; block0[15] = append1_t[2]; block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; block1[ 2] = append2_t[1]; block1[ 3] = append2_t[2]; block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; block1[ 6] = append3_t[1]; block1[ 7] = append3_t[2]; block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; block1[10] = append4_t[1]; block1[11] = append4_t[2]; @@ -405,22 +378,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[11] = append0_t[1]; block0[12] = append0_t[2]; block0[13] = append0_t[3]; - block0[14] = append1_t[0]; block0[15] = append1_t[1]; block1[ 0] = append1_t[2]; block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; block1[ 3] = append2_t[1]; block1[ 4] = append2_t[2]; block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; block1[ 7] = append3_t[1]; block1[ 8] = append3_t[2]; block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; block1[11] = append4_t[1]; block1[12] = append4_t[2]; @@ -431,22 +400,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[12] = append0_t[1]; block0[13] = append0_t[2]; block0[14] = append0_t[3]; - block0[15] = append1_t[0]; block1[ 0] = append1_t[1]; block1[ 1] = append1_t[2]; block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; block1[ 4] = append2_t[1]; block1[ 5] = append2_t[2]; block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; block1[ 8] = append3_t[1]; block1[ 9] = append3_t[2]; block1[10] = append3_t[3]; - block1[11] = append4_t[0]; block1[12] = append4_t[1]; block1[13] = append4_t[2]; @@ -457,22 +422,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[13] = append0_t[1]; block0[14] = append0_t[2]; block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; 
block1[ 1] = append1_t[1]; block1[ 2] = append1_t[2]; block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; block1[ 5] = append2_t[1]; block1[ 6] = append2_t[2]; block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; block1[ 9] = append3_t[1]; block1[10] = append3_t[2]; block1[11] = append3_t[3]; - block1[12] = append4_t[0]; block1[13] = append4_t[1]; block1[14] = append4_t[2]; @@ -483,22 +444,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[14] = append0_t[1]; block0[15] = append0_t[2]; block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; block1[ 2] = append1_t[1]; block1[ 3] = append1_t[2]; block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; block1[ 6] = append2_t[1]; block1[ 7] = append2_t[2]; block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; block1[10] = append3_t[1]; block1[11] = append3_t[2]; block1[12] = append3_t[3]; - block1[13] = append4_t[0]; block1[14] = append4_t[1]; block1[15] = append4_t[2]; @@ -508,22 +465,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[15] = append0_t[1]; block1[ 0] = append0_t[2]; block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; block1[ 7] = append2_t[1]; block1[ 8] = append2_t[2]; block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; block1[11] = append3_t[1]; block1[12] = append3_t[2]; block1[13] = append3_t[3]; - block1[14] = append4_t[0]; block1[15] = append4_t[1]; break; @@ -532,22 +485,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 0] = append0_t[1]; block1[ 1] = append0_t[2]; block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; block1[ 8] = append2_t[1]; block1[ 9] = append2_t[2]; block1[10] = append2_t[3]; - block1[11] = append3_t[0]; block1[12] 
= append3_t[1]; block1[13] = append3_t[2]; block1[14] = append3_t[3]; - block1[15] = append4_t[0]; break; @@ -555,17 +504,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 1] = append0_t[1]; block1[ 2] = append0_t[2]; block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; block1[ 5] = append1_t[1]; block1[ 6] = append1_t[2]; block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; block1[ 9] = append2_t[1]; block1[10] = append2_t[2]; block1[11] = append2_t[3]; - block1[12] = append3_t[0]; block1[13] = append3_t[1]; block1[14] = append3_t[2]; @@ -576,17 +522,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 2] = append0_t[1]; block1[ 3] = append0_t[2]; block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; block1[ 6] = append1_t[1]; block1[ 7] = append1_t[2]; block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; block1[10] = append2_t[1]; block1[11] = append2_t[2]; block1[12] = append2_t[3]; - block1[13] = append3_t[0]; block1[14] = append3_t[1]; block1[15] = append3_t[2]; @@ -596,17 +539,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 3] = append0_t[1]; block1[ 4] = append0_t[2]; block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; block1[ 7] = append1_t[1]; block1[ 8] = append1_t[2]; block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; block1[11] = append2_t[1]; block1[12] = append2_t[2]; block1[13] = append2_t[3]; - block1[14] = append3_t[0]; block1[15] = append3_t[1]; break; @@ -615,17 +555,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 4] = append0_t[1]; block1[ 5] = append0_t[2]; block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; block1[ 8] = append1_t[1]; block1[ 9] = append1_t[2]; block1[10] = append1_t[3]; - block1[11] = append2_t[0]; block1[12] = append2_t[1]; block1[13] = append2_t[2]; block1[14] = append2_t[3]; - block1[15] = append3_t[0]; break; @@ -633,12 
+570,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 5] = append0_t[1]; block1[ 6] = append0_t[2]; block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; block1[ 9] = append1_t[1]; block1[10] = append1_t[2]; block1[11] = append1_t[3]; - block1[12] = append2_t[0]; block1[13] = append2_t[1]; block1[14] = append2_t[2]; @@ -649,12 +584,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 6] = append0_t[1]; block1[ 7] = append0_t[2]; block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; block1[10] = append1_t[1]; block1[11] = append1_t[2]; block1[12] = append1_t[3]; - block1[13] = append2_t[0]; block1[14] = append2_t[1]; block1[15] = append2_t[2]; @@ -664,12 +597,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 7] = append0_t[1]; block1[ 8] = append0_t[2]; block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; block1[11] = append1_t[1]; block1[12] = append1_t[2]; block1[13] = append1_t[3]; - block1[14] = append2_t[0]; block1[15] = append2_t[1]; break; @@ -678,12 +609,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 8] = append0_t[1]; block1[ 9] = append0_t[2]; block1[10] = append0_t[3]; - block1[11] = append1_t[0]; block1[12] = append1_t[1]; block1[13] = append1_t[2]; block1[14] = append1_t[3]; - block1[15] = append2_t[0]; break; @@ -691,7 +620,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 9] = append0_t[1]; block1[10] = append0_t[2]; block1[11] = append0_t[3]; - block1[12] = append1_t[0]; block1[13] = append1_t[1]; block1[14] = append1_t[2]; @@ -702,7 +630,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[10] = append0_t[1]; block1[11] = append0_t[2]; block1[12] = append0_t[3]; - block1[13] = append1_t[0]; block1[14] = append1_t[1]; block1[15] = append1_t[2]; @@ -712,7 +639,6 @@ u32 memcat32 (u32x 
block0[16], u32x block1[16], const u32 block_len, const u32x block1[11] = append0_t[1]; block1[12] = append0_t[2]; block1[13] = append0_t[3]; - block1[14] = append1_t[0]; block1[15] = append1_t[1]; break; @@ -721,7 +647,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[12] = append0_t[1]; block1[13] = append0_t[2]; block1[14] = append0_t[3]; - block1[15] = append1_t[0]; break; @@ -741,7 +666,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } diff --git a/OpenCL/m11400_a3-optimized.cl b/OpenCL/m11400_a3-optimized.cl index efcffb31e..4e883375d 100644 --- a/OpenCL/m11400_a3-optimized.cl +++ b/OpenCL/m11400_a3-optimized.cl @@ -25,76 +25,79 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) +u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; + const u32 mod = offset & 3; + const u32 div = offset / 4; #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; + u32x append00 = swap32 (append0[0]); + u32x append01 = swap32 (append0[1]); + u32x append02 = swap32 (append0[2]); + u32x append03 = swap32 (append0[3]); + u32x append10 = swap32 (append1[0]); + u32x append11 = swap32 (append1[1]); + u32x append12 = 
swap32 (append1[2]); + u32x append13 = swap32 (append1[3]); + u32x append20 = swap32 (append2[0]); + u32x append21 = swap32 (append2[1]); + u32x append22 = swap32 (append2[2]); + u32x append23 = swap32 (append2[3]); + u32x append30 = swap32 (append3[0]); + u32x append31 = swap32 (append3[1]); + u32x append32 = swap32 (append3[2]); + u32x append33 = swap32 (append3[3]); + u32x append0_t[4]; - - append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); - append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); - append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); - append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32x append1_t[4]; - - append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); - append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); - append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); - append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32x append2_t[4]; - - append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); - append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); - append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); - append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32x append3_t[4]; - - append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); - append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); - append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); - append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32x append4_t[4]; - append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); + append0_t[0] = amd_bytealign ( 0, append00, offset); + append0_t[1] = amd_bytealign (append00, append01, offset); + append0_t[2] = amd_bytealign (append01, append02, offset); + append0_t[3] = amd_bytealign (append02, append03, offset); + append1_t[0] = amd_bytealign (append03, append10, 
offset); + append1_t[1] = amd_bytealign (append10, append11, offset); + append1_t[2] = amd_bytealign (append11, append12, offset); + append1_t[3] = amd_bytealign (append12, append13, offset); + append2_t[0] = amd_bytealign (append13, append20, offset); + append2_t[1] = amd_bytealign (append20, append21, offset); + append2_t[2] = amd_bytealign (append21, append22, offset); + append2_t[3] = amd_bytealign (append22, append23, offset); + append3_t[0] = amd_bytealign (append23, append30, offset); + append3_t[1] = amd_bytealign (append30, append31, offset); + append3_t[2] = amd_bytealign (append31, append32, offset); + append3_t[3] = amd_bytealign (append32, append33, offset); + append4_t[0] = amd_bytealign (append33, 0, offset); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; - if (mod == 0) - { - append0_t[0] = append0[0]; - append0_t[1] = append0[1]; - append0_t[2] = append0[2]; - append0_t[3] = append0[3]; + append0_t[0] = swap32 (append0_t[0]); + append0_t[1] = swap32 (append0_t[1]); + append0_t[2] = swap32 (append0_t[2]); + append0_t[3] = swap32 (append0_t[3]); + append1_t[0] = swap32 (append1_t[0]); + append1_t[1] = swap32 (append1_t[1]); + append1_t[2] = swap32 (append1_t[2]); + append1_t[3] = swap32 (append1_t[3]); + append2_t[0] = swap32 (append2_t[0]); + append2_t[1] = swap32 (append2_t[1]); + append2_t[2] = swap32 (append2_t[2]); + append2_t[3] = swap32 (append2_t[3]); + append3_t[0] = swap32 (append3_t[0]); + append3_t[1] = swap32 (append3_t[1]); + append3_t[2] = swap32 (append3_t[2]); + append3_t[3] = swap32 (append3_t[3]); + append4_t[0] = swap32 (append4_t[0]); + append4_t[1] = swap32 (append4_t[1]); + append4_t[2] = swap32 (append4_t[2]); + append4_t[3] = swap32 (append4_t[3]); - append1_t[0] = append1[0]; - append1_t[1] = append1[1]; - append1_t[2] = append1[2]; - append1_t[3] = append1[3]; - - append2_t[0] = append2[0]; - append2_t[1] = append2[1]; - append2_t[2] = append2[2]; - append2_t[3] = append2[3]; - - append3_t[0] = append3[0]; - 
append3_t[1] = append3[1]; - append3_t[2] = append3[2]; - append3_t[3] = append3[3]; - - append4_t[0] = 0; - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - } #endif #ifdef IS_NV @@ -103,40 +106,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + u32x append00 = append0[0]; + u32x append01 = append0[1]; + u32x append02 = append0[2]; + u32x append03 = append0[3]; + u32x append10 = append1[0]; + u32x append11 = append1[1]; + u32x append12 = append1[2]; + u32x append13 = append1[3]; + u32x append20 = append2[0]; + u32x append21 = append2[1]; + u32x append22 = append2[2]; + u32x append23 = append2[3]; + u32x append30 = append3[0]; + u32x append31 = append3[1]; + u32x append32 = append3[2]; + u32x append33 = append3[3]; + u32x append0_t[4]; - - append0_t[0] = __byte_perm ( 0, append0[0], selector); - append0_t[1] = __byte_perm (append0[0], append0[1], selector); - append0_t[2] = __byte_perm (append0[1], append0[2], selector); - append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32x append1_t[4]; - - append1_t[0] = __byte_perm (append0[3], append1[0], selector); - append1_t[1] = __byte_perm (append1[0], append1[1], selector); - append1_t[2] = __byte_perm (append1[1], append1[2], selector); - append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32x append2_t[4]; - - append2_t[0] = __byte_perm (append1[3], append2[0], selector); - append2_t[1] = __byte_perm (append2[0], append2[1], selector); - append2_t[2] = __byte_perm (append2[1], append2[2], selector); - append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32x append3_t[4]; - - append3_t[0] = __byte_perm (append2[3], append3[0], selector); - append3_t[1] = __byte_perm (append3[0], append3[1], selector); - append3_t[2] = __byte_perm (append3[1], append3[2], selector); - append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32x append4_t[4]; - 
append4_t[0] = __byte_perm (append3[3], 0, selector); + append0_t[0] = __byte_perm ( 0, append00, selector); + append0_t[1] = __byte_perm (append00, append01, selector); + append0_t[2] = __byte_perm (append01, append02, selector); + append0_t[3] = __byte_perm (append02, append03, selector); + append1_t[0] = __byte_perm (append03, append10, selector); + append1_t[1] = __byte_perm (append10, append11, selector); + append1_t[2] = __byte_perm (append11, append12, selector); + append1_t[3] = __byte_perm (append12, append13, selector); + append2_t[0] = __byte_perm (append13, append20, selector); + append2_t[1] = __byte_perm (append20, append21, selector); + append2_t[2] = __byte_perm (append21, append22, selector); + append2_t[3] = __byte_perm (append22, append23, selector); + append3_t[0] = __byte_perm (append23, append30, selector); + append3_t[1] = __byte_perm (append30, append31, selector); + append3_t[2] = __byte_perm (append31, append32, selector); + append3_t[3] = __byte_perm (append32, append33, selector); + append4_t[0] = __byte_perm (append33, 0, selector); append4_t[1] = 0; append4_t[2] = 0; append4_t[3] = 0; + #endif switch (div) @@ -145,22 +158,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 1] = append0_t[1]; block0[ 2] = append0_t[2]; block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; block0[ 5] = append1_t[1]; block0[ 6] = append1_t[2]; block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; block0[ 9] = append2_t[1]; block0[10] = append2_t[2]; block0[11] = append2_t[3]; - block0[12] = append3_t[0]; block0[13] = append3_t[1]; block0[14] = append3_t[2]; block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; block1[ 1] = append4_t[1]; block1[ 2] = append4_t[2]; @@ -171,22 +180,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 2] = append0_t[1]; block0[ 3] = append0_t[2]; block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; block0[ 6] = append1_t[1]; block0[ 7] 
= append1_t[2]; block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; block0[10] = append2_t[1]; block0[11] = append2_t[2]; block0[12] = append2_t[3]; - block0[13] = append3_t[0]; block0[14] = append3_t[1]; block0[15] = append3_t[2]; block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; block1[ 2] = append4_t[1]; block1[ 3] = append4_t[2]; @@ -197,22 +202,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 3] = append0_t[1]; block0[ 4] = append0_t[2]; block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; block0[ 7] = append1_t[1]; block0[ 8] = append1_t[2]; block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; block0[11] = append2_t[1]; block0[12] = append2_t[2]; block0[13] = append2_t[3]; - block0[14] = append3_t[0]; block0[15] = append3_t[1]; block1[ 0] = append3_t[2]; block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; block1[ 3] = append4_t[1]; block1[ 4] = append4_t[2]; @@ -223,22 +224,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 4] = append0_t[1]; block0[ 5] = append0_t[2]; block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; block0[ 8] = append1_t[1]; block0[ 9] = append1_t[2]; block0[10] = append1_t[3]; - block0[11] = append2_t[0]; block0[12] = append2_t[1]; block0[13] = append2_t[2]; block0[14] = append2_t[3]; - block0[15] = append3_t[0]; block1[ 0] = append3_t[1]; block1[ 1] = append3_t[2]; block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; block1[ 4] = append4_t[1]; block1[ 5] = append4_t[2]; @@ -249,22 +246,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 5] = append0_t[1]; block0[ 6] = append0_t[2]; block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; block0[ 9] = append1_t[1]; block0[10] = append1_t[2]; block0[11] = append1_t[3]; - block0[12] = append2_t[0]; block0[13] = append2_t[1]; block0[14] = append2_t[2]; block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; block1[ 1] = append3_t[1]; 
block1[ 2] = append3_t[2]; block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; block1[ 5] = append4_t[1]; block1[ 6] = append4_t[2]; @@ -275,22 +268,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 6] = append0_t[1]; block0[ 7] = append0_t[2]; block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; block0[10] = append1_t[1]; block0[11] = append1_t[2]; block0[12] = append1_t[3]; - block0[13] = append2_t[0]; block0[14] = append2_t[1]; block0[15] = append2_t[2]; block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; block1[ 2] = append3_t[1]; block1[ 3] = append3_t[2]; block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; block1[ 6] = append4_t[1]; block1[ 7] = append4_t[2]; @@ -301,22 +290,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 7] = append0_t[1]; block0[ 8] = append0_t[2]; block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; block0[11] = append1_t[1]; block0[12] = append1_t[2]; block0[13] = append1_t[3]; - block0[14] = append2_t[0]; block0[15] = append2_t[1]; block1[ 0] = append2_t[2]; block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; block1[ 3] = append3_t[1]; block1[ 4] = append3_t[2]; block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; block1[ 7] = append4_t[1]; block1[ 8] = append4_t[2]; @@ -327,22 +312,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[ 8] = append0_t[1]; block0[ 9] = append0_t[2]; block0[10] = append0_t[3]; - block0[11] = append1_t[0]; block0[12] = append1_t[1]; block0[13] = append1_t[2]; block0[14] = append1_t[3]; - block0[15] = append2_t[0]; block1[ 0] = append2_t[1]; block1[ 1] = append2_t[2]; block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; block1[ 4] = append3_t[1]; block1[ 5] = append3_t[2]; block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; block1[ 8] = append4_t[1]; block1[ 9] = append4_t[2]; @@ -353,22 +334,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], 
const u32 block_len, const u32x block0[ 9] = append0_t[1]; block0[10] = append0_t[2]; block0[11] = append0_t[3]; - block0[12] = append1_t[0]; block0[13] = append1_t[1]; block0[14] = append1_t[2]; block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; block1[ 1] = append2_t[1]; block1[ 2] = append2_t[2]; block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; block1[ 5] = append3_t[1]; block1[ 6] = append3_t[2]; block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; block1[ 9] = append4_t[1]; block1[10] = append4_t[2]; @@ -379,22 +356,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[10] = append0_t[1]; block0[11] = append0_t[2]; block0[12] = append0_t[3]; - block0[13] = append1_t[0]; block0[14] = append1_t[1]; block0[15] = append1_t[2]; block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; block1[ 2] = append2_t[1]; block1[ 3] = append2_t[2]; block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; block1[ 6] = append3_t[1]; block1[ 7] = append3_t[2]; block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; block1[10] = append4_t[1]; block1[11] = append4_t[2]; @@ -405,22 +378,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[11] = append0_t[1]; block0[12] = append0_t[2]; block0[13] = append0_t[3]; - block0[14] = append1_t[0]; block0[15] = append1_t[1]; block1[ 0] = append1_t[2]; block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; block1[ 3] = append2_t[1]; block1[ 4] = append2_t[2]; block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; block1[ 7] = append3_t[1]; block1[ 8] = append3_t[2]; block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; block1[11] = append4_t[1]; block1[12] = append4_t[2]; @@ -431,22 +400,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[12] = append0_t[1]; block0[13] = append0_t[2]; block0[14] = append0_t[3]; - block0[15] = append1_t[0]; block1[ 0] = append1_t[1]; block1[ 1] = append1_t[2]; block1[ 2] = 
append1_t[3]; - block1[ 3] = append2_t[0]; block1[ 4] = append2_t[1]; block1[ 5] = append2_t[2]; block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; block1[ 8] = append3_t[1]; block1[ 9] = append3_t[2]; block1[10] = append3_t[3]; - block1[11] = append4_t[0]; block1[12] = append4_t[1]; block1[13] = append4_t[2]; @@ -457,22 +422,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[13] = append0_t[1]; block0[14] = append0_t[2]; block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; block1[ 1] = append1_t[1]; block1[ 2] = append1_t[2]; block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; block1[ 5] = append2_t[1]; block1[ 6] = append2_t[2]; block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; block1[ 9] = append3_t[1]; block1[10] = append3_t[2]; block1[11] = append3_t[3]; - block1[12] = append4_t[0]; block1[13] = append4_t[1]; block1[14] = append4_t[2]; @@ -483,22 +444,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[14] = append0_t[1]; block0[15] = append0_t[2]; block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; block1[ 2] = append1_t[1]; block1[ 3] = append1_t[2]; block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; block1[ 6] = append2_t[1]; block1[ 7] = append2_t[2]; block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; block1[10] = append3_t[1]; block1[11] = append3_t[2]; block1[12] = append3_t[3]; - block1[13] = append4_t[0]; block1[14] = append4_t[1]; block1[15] = append4_t[2]; @@ -508,22 +465,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block0[15] = append0_t[1]; block1[ 0] = append0_t[2]; block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; block1[ 7] = append2_t[1]; block1[ 8] = append2_t[2]; block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; block1[11] = append3_t[1]; block1[12] = append3_t[2]; 
block1[13] = append3_t[3]; - block1[14] = append4_t[0]; block1[15] = append4_t[1]; break; @@ -532,22 +485,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 0] = append0_t[1]; block1[ 1] = append0_t[2]; block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; block1[ 4] = append1_t[2]; block1[ 5] = append1_t[3]; block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; block1[ 8] = append2_t[1]; block1[ 9] = append2_t[2]; block1[10] = append2_t[3]; - block1[11] = append3_t[0]; block1[12] = append3_t[1]; block1[13] = append3_t[2]; block1[14] = append3_t[3]; - block1[15] = append4_t[0]; break; @@ -555,17 +504,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 1] = append0_t[1]; block1[ 2] = append0_t[2]; block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; block1[ 5] = append1_t[1]; block1[ 6] = append1_t[2]; block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; block1[ 9] = append2_t[1]; block1[10] = append2_t[2]; block1[11] = append2_t[3]; - block1[12] = append3_t[0]; block1[13] = append3_t[1]; block1[14] = append3_t[2]; @@ -576,17 +522,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 2] = append0_t[1]; block1[ 3] = append0_t[2]; block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; block1[ 6] = append1_t[1]; block1[ 7] = append1_t[2]; block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; block1[10] = append2_t[1]; block1[11] = append2_t[2]; block1[12] = append2_t[3]; - block1[13] = append3_t[0]; block1[14] = append3_t[1]; block1[15] = append3_t[2]; @@ -596,17 +539,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 3] = append0_t[1]; block1[ 4] = append0_t[2]; block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; block1[ 7] = append1_t[1]; block1[ 8] = append1_t[2]; block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; block1[11] = append2_t[1]; block1[12] = append2_t[2]; block1[13] = 
append2_t[3]; - block1[14] = append3_t[0]; block1[15] = append3_t[1]; break; @@ -615,17 +555,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 4] = append0_t[1]; block1[ 5] = append0_t[2]; block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; block1[ 8] = append1_t[1]; block1[ 9] = append1_t[2]; block1[10] = append1_t[3]; - block1[11] = append2_t[0]; block1[12] = append2_t[1]; block1[13] = append2_t[2]; block1[14] = append2_t[3]; - block1[15] = append3_t[0]; break; @@ -633,12 +570,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 5] = append0_t[1]; block1[ 6] = append0_t[2]; block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; block1[ 9] = append1_t[1]; block1[10] = append1_t[2]; block1[11] = append1_t[3]; - block1[12] = append2_t[0]; block1[13] = append2_t[1]; block1[14] = append2_t[2]; @@ -649,12 +584,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 6] = append0_t[1]; block1[ 7] = append0_t[2]; block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; block1[10] = append1_t[1]; block1[11] = append1_t[2]; block1[12] = append1_t[3]; - block1[13] = append2_t[0]; block1[14] = append2_t[1]; block1[15] = append2_t[2]; @@ -664,12 +597,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 7] = append0_t[1]; block1[ 8] = append0_t[2]; block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; block1[11] = append1_t[1]; block1[12] = append1_t[2]; block1[13] = append1_t[3]; - block1[14] = append2_t[0]; block1[15] = append2_t[1]; break; @@ -678,12 +609,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 8] = append0_t[1]; block1[ 9] = append0_t[2]; block1[10] = append0_t[3]; - block1[11] = append1_t[0]; block1[12] = append1_t[1]; block1[13] = append1_t[2]; block1[14] = append1_t[3]; - block1[15] = append2_t[0]; break; @@ -691,7 +620,6 @@ u32 memcat32 (u32x 
block0[16], u32x block1[16], const u32 block_len, const u32x block1[ 9] = append0_t[1]; block1[10] = append0_t[2]; block1[11] = append0_t[3]; - block1[12] = append1_t[0]; block1[13] = append1_t[1]; block1[14] = append1_t[2]; @@ -702,7 +630,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[10] = append0_t[1]; block1[11] = append0_t[2]; block1[12] = append0_t[3]; - block1[13] = append1_t[0]; block1[14] = append1_t[1]; block1[15] = append1_t[2]; @@ -712,7 +639,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[11] = append0_t[1]; block1[12] = append0_t[2]; block1[13] = append0_t[3]; - block1[14] = append1_t[0]; block1[15] = append1_t[1]; break; @@ -721,7 +647,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x block1[12] = append0_t[1]; block1[13] = append0_t[2]; block1[14] = append0_t[3]; - block1[15] = append1_t[0]; break; @@ -741,7 +666,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } diff --git a/OpenCL/m13800_a0-optimized.cl b/OpenCL/m13800_a0-optimized.cl index 03e63e556..c44bc74af 100644 --- a/OpenCL/m13800_a0-optimized.cl +++ b/OpenCL/m13800_a0-optimized.cl @@ -146,6 +146,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = 
amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -168,26 +188,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a1-optimized.cl b/OpenCL/m13800_a1-optimized.cl index 2e557fae5..f6315b732 100644 --- a/OpenCL/m13800_a1-optimized.cl +++ b/OpenCL/m13800_a1-optimized.cl @@ -144,6 +144,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + 
#if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -166,26 +186,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign 
(carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; diff --git a/OpenCL/m13800_a3-optimized.cl b/OpenCL/m13800_a3-optimized.cl index 85818a31e..618f7a130 100644 --- a/OpenCL/m13800_a3-optimized.cl +++ b/OpenCL/m13800_a3-optimized.cl @@ -143,6 +143,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) u32x tmp15; u32x tmp16; + #if defined IS_AMD || defined IS_GENERIC + tmp00 = amd_bytealign ( 0, carry[ 0], offset); + tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); + tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); + tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset); + tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); + tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); + tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); + tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); + tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); + tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); + tmp10 = amd_bytealign (carry[ 9], carry[10], offset); + tmp11 = amd_bytealign (carry[10], carry[11], offset); + tmp12 = amd_bytealign (carry[11], carry[12], offset); + tmp13 = amd_bytealign (carry[12], carry[13], offset); + tmp14 = amd_bytealign (carry[13], carry[14], offset); + tmp15 = amd_bytealign (carry[14], carry[15], offset); + tmp16 = amd_bytealign (carry[15], 0, offset); + #endif + #ifdef IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; @@ -165,26 +185,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16]) tmp16 = __byte_perm ( 0, carry[15], selector); #endif - #if defined IS_AMD || defined IS_GENERIC - tmp00 = amd_bytealign ( 0, carry[ 0], offset); - tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset); - tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset); - tmp03 = amd_bytealign (carry[ 2], 
carry[ 3], offset); - tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset); - tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset); - tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset); - tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset); - tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset); - tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset); - tmp10 = amd_bytealign (carry[ 9], carry[10], offset); - tmp11 = amd_bytealign (carry[10], carry[11], offset); - tmp12 = amd_bytealign (carry[11], carry[12], offset); - tmp13 = amd_bytealign (carry[12], carry[13], offset); - tmp14 = amd_bytealign (carry[13], carry[14], offset); - tmp15 = amd_bytealign (carry[14], carry[15], offset); - tmp16 = amd_bytealign (carry[15], 0, offset); - #endif - carry[ 0] = 0; carry[ 1] = 0; carry[ 2] = 0; From 7548e5f85a8e9061c67572e81b40c01baec62618 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 6 Aug 2017 13:54:02 +0200 Subject: [PATCH 66/75] Add pure kernels for PostgreSQL CRAM (MD5) --- OpenCL/m11100_a0.cl | 350 +++++++++++++++++++++++++++++++++++++++ OpenCL/m11100_a1.cl | 326 ++++++++++++++++++++++++++++++++++++ OpenCL/m11100_a3.cl | 390 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1066 insertions(+) create mode 100644 OpenCL/m11100_a0.cl create mode 100644 OpenCL/m11100_a1.cl create mode 100644 OpenCL/m11100_a3.cl diff --git a/OpenCL/m11100_a0.cl b/OpenCL/m11100_a0.cl new file mode 100644 index 000000000..2b73633d9 --- /dev/null +++ b/OpenCL/m11100_a0.cl @@ -0,0 +1,350 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], 
l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + 
+ for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1; + + md5_init (&ctx1); + + md5_update (&ctx1, w, pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | 
uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const 
u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1; + + md5_init (&ctx1); + + md5_update (&ctx1, w, pw_len); 
+ + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11100_a1.cl b/OpenCL/m11100_a1.cl new file mode 100644 index 000000000..a2b234985 --- /dev/null +++ b/OpenCL/m11100_a1.cl @@ -0,0 +1,326 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT 
+ */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 
bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + md5_ctx_t ctx0t; + + md5_init (&ctx0t); + + md5_update_global (&ctx0t, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0t; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, 
s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + md5_ctx_t ctx0t; + + md5_init (&ctx0t); + + md5_update_global (&ctx0t, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < 
il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0t; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11100_a3.cl b/OpenCL/m11100_a3.cl new file mode 100644 index 
000000000..492d4b9af --- /dev/null +++ b/OpenCL/m11100_a3.cl @@ -0,0 +1,390 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, 
__global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_ctx_vector_t ctx1; + + md5_init_vector (&ctx1); + + md5_update_vector (&ctx1, 
w, pw_len); + + u32x s0[4]; + u32x s1[4]; + u32x s2[4]; + u32x s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_vector_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w2[0] = challenge; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 
16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + 
* bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 challenge; + + challenge = salt_bufs[salt_pos].salt_buf[0]; + + u32 salt_buf0[4]; + u32 salt_buf1[4]; + + salt_buf0[0] = salt_bufs[salt_pos].salt_buf[1]; // not a bug, see challenge + salt_buf0[1] = salt_bufs[salt_pos].salt_buf[2]; + salt_buf0[2] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf0[3] = salt_bufs[salt_pos].salt_buf[4]; + salt_buf1[0] = salt_bufs[salt_pos].salt_buf[5]; + salt_buf1[1] = salt_bufs[salt_pos].salt_buf[6]; + salt_buf1[2] = salt_bufs[salt_pos].salt_buf[7]; + salt_buf1[3] = salt_bufs[salt_pos].salt_buf[8]; + + const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + md5_ctx_vector_t ctx1; + + md5_init_vector (&ctx1); + + md5_update_vector (&ctx1, w, pw_len); + + u32x s0[4]; + u32x s1[4]; + u32x s2[4]; + u32x s3[4]; + + s0[0] = salt_buf0[0]; + s0[1] = salt_buf0[1]; + s0[2] = salt_buf0[2]; + s0[3] = salt_buf0[3]; + s1[0] = salt_buf1[0]; + s1[1] = salt_buf1[1]; + 
s1[2] = salt_buf1[2]; + s1[3] = salt_buf1[3]; + s2[0] = 0; + s2[1] = 0; + s2[2] = 0; + s2[3] = 0; + s3[0] = 0; + s3[1] = 0; + s3[2] = 0; + s3[3] = 0; + + md5_update_vector_64 (&ctx1, s0, s1, s2, s3, salt_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + ctx.w2[0] = challenge; + ctx.w2[1] = 0; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len = 32 + 4; + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From b1f9ed4a7cc7e41918e833c9840cc8e22ac3a77f Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 6 Aug 2017 15:33:38 +0200 Subject: [PATCH 67/75] Add pure kernels for MySQL CRAM (SHA1) --- OpenCL/m11200_a0.cl | 278 ++++++++++++++++++++++++++++++++++++++++ OpenCL/m11200_a1.cl | 254 ++++++++++++++++++++++++++++++++++++ OpenCL/m11200_a3.cl | 304 ++++++++++++++++++++++++++++++++++++++++++++ 
3 files changed, 836 insertions(+) create mode 100644 OpenCL/m11200_a0.cl create mode 100644 OpenCL/m11200_a1.cl create mode 100644 OpenCL/m11200_a3.cl diff --git a/OpenCL/m11200_a0.cl b/OpenCL/m11200_a0.cl new file mode 100644 index 000000000..c416f9b6a --- /dev/null +++ b/OpenCL/m11200_a0.cl @@ -0,0 +1,278 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; 
idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx2; + + sha1_init (&ctx2); + + sha1_update_swap (&ctx2, w, pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx2; + + sha1_init (&ctx2); + + sha1_update_swap (&ctx2, w, pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + 
ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11200_a1.cl b/OpenCL/m11200_a1.cl new file mode 100644 index 000000000..ef5836abf --- /dev/null +++ b/OpenCL/m11200_a1.cl @@ -0,0 +1,254 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx2l; + + sha1_init (&ctx2l); + + sha1_update_global_swap (&ctx2l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx2 = ctx2l; + + sha1_update_global_swap (&ctx2, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; 
+ const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + sha1_ctx_t ctx2l; + + sha1_init (&ctx2l); + + sha1_update_global_swap (&ctx2l, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx2 = ctx2l; + + sha1_update_global_swap (&ctx2, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + 
sha1_final (&ctx2); + + u32 a = ctx2.h[0]; + u32 b = ctx2.h[1]; + u32 c = ctx2.h[2]; + u32 d = ctx2.h[3]; + u32 e = ctx2.h[4]; + + const u32 a_sav = a; + const u32 b_sav = b; + const u32 c_sav = c; + const u32 d_sav = d; + const u32 e_sav = e; + + sha1_ctx_t ctx1; + + sha1_init (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_t ctx = ctx0; + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11200_a3.cl b/OpenCL/m11200_a3.cl new file mode 100644 index 000000000..d162f3690 --- /dev/null +++ b/OpenCL/m11200_a3.cl @@ -0,0 +1,304 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) 
(l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + 
sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector (&ctx2); + + sha1_update_vector (&ctx2, w, pw_len); + + sha1_final_vector (&ctx2); + + u32x a = ctx2.h[0]; + u32x b = ctx2.h[1]; + u32x c = ctx2.h[2]; + u32x d = ctx2.h[3]; + u32x e = ctx2.h[4]; + + const u32x a_sav = a; + const u32x b_sav = b; + const u32x c_sav = c; + const u32x d_sav = d; + const u32x e_sav = e; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final_vector (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final_vector (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0lr = w0l | w0r; + + w[0] = w0lr; + + sha1_ctx_vector_t ctx2; + + sha1_init_vector (&ctx2); + + sha1_update_vector (&ctx2, w, pw_len); + + sha1_final_vector (&ctx2); + + u32x a = ctx2.h[0]; + u32x b = ctx2.h[1]; + u32x c = ctx2.h[2]; + u32x d = ctx2.h[3]; + u32x e = 
ctx2.h[4]; + + const u32x a_sav = a; + const u32x b_sav = b; + const u32x c_sav = c; + const u32x d_sav = d; + const u32x e_sav = e; + + sha1_ctx_vector_t ctx1; + + sha1_init_vector (&ctx1); + + ctx1.w0[0] = a; + ctx1.w0[1] = b; + ctx1.w0[2] = c; + ctx1.w0[3] = d; + ctx1.w1[0] = e; + + ctx1.len = 20; + + sha1_final_vector (&ctx1); + + a = ctx1.h[0]; + b = ctx1.h[1]; + c = ctx1.h[2]; + d = ctx1.h[3]; + e = ctx1.h[4]; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = a; + w0[1] = b; + w0[2] = c; + w0[3] = d; + w1[0] = e; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + sha1_update_vector_64 (&ctx, w0, w1, w2, w3, 20); + + sha1_final_vector (&ctx); + + ctx.h[0] ^= a_sav; + ctx.h[1] ^= b_sav; + ctx.h[2] ^= c_sav; + ctx.h[3] ^= d_sav; + ctx.h[4] ^= e_sav; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 4f72c8bee6c543c18e272e187e52ec67b9a41b8b Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 7 Aug 2017 13:39:17 +0200 Subject: [PATCH 68/75] Add pure kernels for SIP digest authentication (MD5) --- OpenCL/inc_types.cl | 4 +- OpenCL/m11400_a0-optimized.cl | 2186 ------------ OpenCL/m11400_a0.cl | 254 ++ OpenCL/m11400_a1-optimized.cl | 2300 ------------- OpenCL/m11400_a1.cl | 226 ++ OpenCL/m11400_a3-optimized.cl | 5968 --------------------------------- OpenCL/m11400_a3.cl | 294 ++ include/interface.h | 4 +- 8 files changed, 778 insertions(+), 10458 deletions(-) delete mode 100644 OpenCL/m11400_a0-optimized.cl create mode 100644 OpenCL/m11400_a0.cl delete mode 100644 OpenCL/m11400_a1-optimized.cl create mode 100644 OpenCL/m11400_a1.cl delete mode 100644 OpenCL/m11400_a3-optimized.cl create mode 100644 OpenCL/m11400_a3.cl diff --git 
a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index c31bc0046..764ffd9c1 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -791,10 +791,10 @@ typedef struct bitcoin_wallet typedef struct sip { - u32 salt_buf[30]; + u32 salt_buf[32]; u32 salt_len; - u32 esalt_buf[38]; + u32 esalt_buf[48]; u32 esalt_len; } sip_t; diff --git a/OpenCL/m11400_a0-optimized.cl b/OpenCL/m11400_a0-optimized.cl deleted file mode 100644 index 4e740b646..000000000 --- a/OpenCL/m11400_a0-optimized.cl +++ /dev/null @@ -1,2186 +0,0 @@ -/** - * Author......: See docs/credits.txt - * License.....: MIT - */ - -//incompatible because of brances -//#define NEW_SIMD_CODE - -#include "inc_vendor.cl" -#include "inc_hash_constants.h" -#include "inc_hash_functions.cl" -#include "inc_types.cl" -#include "inc_common.cl" -#include "inc_rp.h" -#include "inc_rp.cl" -#include "inc_simd.cl" - -#if VECT_SIZE == 1 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) -#elif VECT_SIZE == 2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#elif VECT_SIZE == 4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#elif VECT_SIZE == 8 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) -#elif VECT_SIZE == 16 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) -#endif - -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) -{ - const u32 mod = offset & 3; - const u32 div = 
offset / 4; - - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - mod; - - u32x append00 = swap32 (append0[0]); - u32x append01 = swap32 (append0[1]); - u32x append02 = swap32 (append0[2]); - u32x append03 = swap32 (append0[3]); - u32x append10 = swap32 (append1[0]); - u32x append11 = swap32 (append1[1]); - u32x append12 = swap32 (append1[2]); - u32x append13 = swap32 (append1[3]); - u32x append20 = swap32 (append2[0]); - u32x append21 = swap32 (append2[1]); - u32x append22 = swap32 (append2[2]); - u32x append23 = swap32 (append2[3]); - u32x append30 = swap32 (append3[0]); - u32x append31 = swap32 (append3[1]); - u32x append32 = swap32 (append3[2]); - u32x append33 = swap32 (append3[3]); - - u32x append0_t[4]; - u32x append1_t[4]; - u32x append2_t[4]; - u32x append3_t[4]; - u32x append4_t[4]; - - append0_t[0] = amd_bytealign ( 0, append00, offset); - append0_t[1] = amd_bytealign (append00, append01, offset); - append0_t[2] = amd_bytealign (append01, append02, offset); - append0_t[3] = amd_bytealign (append02, append03, offset); - append1_t[0] = amd_bytealign (append03, append10, offset); - append1_t[1] = amd_bytealign (append10, append11, offset); - append1_t[2] = amd_bytealign (append11, append12, offset); - append1_t[3] = amd_bytealign (append12, append13, offset); - append2_t[0] = amd_bytealign (append13, append20, offset); - append2_t[1] = amd_bytealign (append20, append21, offset); - append2_t[2] = amd_bytealign (append21, append22, offset); - append2_t[3] = amd_bytealign (append22, append23, offset); - append3_t[0] = amd_bytealign (append23, append30, offset); - append3_t[1] = amd_bytealign (append30, append31, offset); - append3_t[2] = amd_bytealign (append31, append32, offset); - append3_t[3] = amd_bytealign (append32, append33, offset); - append4_t[0] = amd_bytealign (append33, 0, offset); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - append0_t[0] = swap32 (append0_t[0]); - append0_t[1] = swap32 (append0_t[1]); - 
append0_t[2] = swap32 (append0_t[2]); - append0_t[3] = swap32 (append0_t[3]); - append1_t[0] = swap32 (append1_t[0]); - append1_t[1] = swap32 (append1_t[1]); - append1_t[2] = swap32 (append1_t[2]); - append1_t[3] = swap32 (append1_t[3]); - append2_t[0] = swap32 (append2_t[0]); - append2_t[1] = swap32 (append2_t[1]); - append2_t[2] = swap32 (append2_t[2]); - append2_t[3] = swap32 (append2_t[3]); - append3_t[0] = swap32 (append3_t[0]); - append3_t[1] = swap32 (append3_t[1]); - append3_t[2] = swap32 (append3_t[2]); - append3_t[3] = swap32 (append3_t[3]); - append4_t[0] = swap32 (append4_t[0]); - append4_t[1] = swap32 (append4_t[1]); - append4_t[2] = swap32 (append4_t[2]); - append4_t[3] = swap32 (append4_t[3]); - - #endif - - #ifdef IS_NV - - const int offset_minus_4 = 4 - mod; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32x append00 = append0[0]; - u32x append01 = append0[1]; - u32x append02 = append0[2]; - u32x append03 = append0[3]; - u32x append10 = append1[0]; - u32x append11 = append1[1]; - u32x append12 = append1[2]; - u32x append13 = append1[3]; - u32x append20 = append2[0]; - u32x append21 = append2[1]; - u32x append22 = append2[2]; - u32x append23 = append2[3]; - u32x append30 = append3[0]; - u32x append31 = append3[1]; - u32x append32 = append3[2]; - u32x append33 = append3[3]; - - u32x append0_t[4]; - u32x append1_t[4]; - u32x append2_t[4]; - u32x append3_t[4]; - u32x append4_t[4]; - - append0_t[0] = __byte_perm ( 0, append00, selector); - append0_t[1] = __byte_perm (append00, append01, selector); - append0_t[2] = __byte_perm (append01, append02, selector); - append0_t[3] = __byte_perm (append02, append03, selector); - append1_t[0] = __byte_perm (append03, append10, selector); - append1_t[1] = __byte_perm (append10, append11, selector); - append1_t[2] = __byte_perm (append11, append12, selector); - append1_t[3] = __byte_perm (append12, append13, selector); - append2_t[0] = __byte_perm (append13, append20, selector); - 
append2_t[1] = __byte_perm (append20, append21, selector); - append2_t[2] = __byte_perm (append21, append22, selector); - append2_t[3] = __byte_perm (append22, append23, selector); - append3_t[0] = __byte_perm (append23, append30, selector); - append3_t[1] = __byte_perm (append30, append31, selector); - append3_t[2] = __byte_perm (append31, append32, selector); - append3_t[3] = __byte_perm (append32, append33, selector); - append4_t[0] = __byte_perm (append33, 0, selector); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - #endif - - switch (div) - { - case 0: block0[ 0] |= append0_t[0]; - block0[ 1] = append0_t[1]; - block0[ 2] = append0_t[2]; - block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; - block0[ 5] = append1_t[1]; - block0[ 6] = append1_t[2]; - block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; - block0[ 9] = append2_t[1]; - block0[10] = append2_t[2]; - block0[11] = append2_t[3]; - block0[12] = append3_t[0]; - block0[13] = append3_t[1]; - block0[14] = append3_t[2]; - block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; - block1[ 1] = append4_t[1]; - block1[ 2] = append4_t[2]; - block1[ 3] = append4_t[3]; - break; - - case 1: block0[ 1] |= append0_t[0]; - block0[ 2] = append0_t[1]; - block0[ 3] = append0_t[2]; - block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; - block0[ 6] = append1_t[1]; - block0[ 7] = append1_t[2]; - block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; - block0[10] = append2_t[1]; - block0[11] = append2_t[2]; - block0[12] = append2_t[3]; - block0[13] = append3_t[0]; - block0[14] = append3_t[1]; - block0[15] = append3_t[2]; - block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; - block1[ 2] = append4_t[1]; - block1[ 3] = append4_t[2]; - block1[ 4] = append4_t[3]; - break; - - case 2: block0[ 2] |= append0_t[0]; - block0[ 3] = append0_t[1]; - block0[ 4] = append0_t[2]; - block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; - block0[ 7] = append1_t[1]; - block0[ 8] = append1_t[2]; - block0[ 9] = 
append1_t[3]; - block0[10] = append2_t[0]; - block0[11] = append2_t[1]; - block0[12] = append2_t[2]; - block0[13] = append2_t[3]; - block0[14] = append3_t[0]; - block0[15] = append3_t[1]; - block1[ 0] = append3_t[2]; - block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; - block1[ 3] = append4_t[1]; - block1[ 4] = append4_t[2]; - block1[ 5] = append4_t[3]; - break; - - case 3: block0[ 3] |= append0_t[0]; - block0[ 4] = append0_t[1]; - block0[ 5] = append0_t[2]; - block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; - block0[ 8] = append1_t[1]; - block0[ 9] = append1_t[2]; - block0[10] = append1_t[3]; - block0[11] = append2_t[0]; - block0[12] = append2_t[1]; - block0[13] = append2_t[2]; - block0[14] = append2_t[3]; - block0[15] = append3_t[0]; - block1[ 0] = append3_t[1]; - block1[ 1] = append3_t[2]; - block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; - block1[ 4] = append4_t[1]; - block1[ 5] = append4_t[2]; - block1[ 6] = append4_t[3]; - break; - - case 4: block0[ 4] |= append0_t[0]; - block0[ 5] = append0_t[1]; - block0[ 6] = append0_t[2]; - block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; - block0[ 9] = append1_t[1]; - block0[10] = append1_t[2]; - block0[11] = append1_t[3]; - block0[12] = append2_t[0]; - block0[13] = append2_t[1]; - block0[14] = append2_t[2]; - block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; - block1[ 1] = append3_t[1]; - block1[ 2] = append3_t[2]; - block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; - block1[ 5] = append4_t[1]; - block1[ 6] = append4_t[2]; - block1[ 7] = append4_t[3]; - break; - - case 5: block0[ 5] |= append0_t[0]; - block0[ 6] = append0_t[1]; - block0[ 7] = append0_t[2]; - block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; - block0[10] = append1_t[1]; - block0[11] = append1_t[2]; - block0[12] = append1_t[3]; - block0[13] = append2_t[0]; - block0[14] = append2_t[1]; - block0[15] = append2_t[2]; - block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; - block1[ 2] = append3_t[1]; - block1[ 
3] = append3_t[2]; - block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; - block1[ 6] = append4_t[1]; - block1[ 7] = append4_t[2]; - block1[ 8] = append4_t[3]; - break; - - case 6: block0[ 6] |= append0_t[0]; - block0[ 7] = append0_t[1]; - block0[ 8] = append0_t[2]; - block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; - block0[11] = append1_t[1]; - block0[12] = append1_t[2]; - block0[13] = append1_t[3]; - block0[14] = append2_t[0]; - block0[15] = append2_t[1]; - block1[ 0] = append2_t[2]; - block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; - block1[ 3] = append3_t[1]; - block1[ 4] = append3_t[2]; - block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; - block1[ 7] = append4_t[1]; - block1[ 8] = append4_t[2]; - block1[ 9] = append4_t[3]; - break; - - case 7: block0[ 7] |= append0_t[0]; - block0[ 8] = append0_t[1]; - block0[ 9] = append0_t[2]; - block0[10] = append0_t[3]; - block0[11] = append1_t[0]; - block0[12] = append1_t[1]; - block0[13] = append1_t[2]; - block0[14] = append1_t[3]; - block0[15] = append2_t[0]; - block1[ 0] = append2_t[1]; - block1[ 1] = append2_t[2]; - block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; - block1[ 4] = append3_t[1]; - block1[ 5] = append3_t[2]; - block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; - block1[ 8] = append4_t[1]; - block1[ 9] = append4_t[2]; - block1[10] = append4_t[3]; - break; - - case 8: block0[ 8] |= append0_t[0]; - block0[ 9] = append0_t[1]; - block0[10] = append0_t[2]; - block0[11] = append0_t[3]; - block0[12] = append1_t[0]; - block0[13] = append1_t[1]; - block0[14] = append1_t[2]; - block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; - block1[ 1] = append2_t[1]; - block1[ 2] = append2_t[2]; - block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; - block1[ 5] = append3_t[1]; - block1[ 6] = append3_t[2]; - block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; - block1[ 9] = append4_t[1]; - block1[10] = append4_t[2]; - block1[11] = append4_t[3]; - break; - - case 9: block0[ 9] |= 
append0_t[0]; - block0[10] = append0_t[1]; - block0[11] = append0_t[2]; - block0[12] = append0_t[3]; - block0[13] = append1_t[0]; - block0[14] = append1_t[1]; - block0[15] = append1_t[2]; - block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; - block1[ 2] = append2_t[1]; - block1[ 3] = append2_t[2]; - block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; - block1[ 6] = append3_t[1]; - block1[ 7] = append3_t[2]; - block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; - block1[10] = append4_t[1]; - block1[11] = append4_t[2]; - block1[12] = append4_t[3]; - break; - - case 10: block0[10] |= append0_t[0]; - block0[11] = append0_t[1]; - block0[12] = append0_t[2]; - block0[13] = append0_t[3]; - block0[14] = append1_t[0]; - block0[15] = append1_t[1]; - block1[ 0] = append1_t[2]; - block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; - block1[ 3] = append2_t[1]; - block1[ 4] = append2_t[2]; - block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; - block1[ 7] = append3_t[1]; - block1[ 8] = append3_t[2]; - block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; - block1[11] = append4_t[1]; - block1[12] = append4_t[2]; - block1[13] = append4_t[3]; - break; - - case 11: block0[11] |= append0_t[0]; - block0[12] = append0_t[1]; - block0[13] = append0_t[2]; - block0[14] = append0_t[3]; - block0[15] = append1_t[0]; - block1[ 0] = append1_t[1]; - block1[ 1] = append1_t[2]; - block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; - block1[ 4] = append2_t[1]; - block1[ 5] = append2_t[2]; - block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; - block1[ 8] = append3_t[1]; - block1[ 9] = append3_t[2]; - block1[10] = append3_t[3]; - block1[11] = append4_t[0]; - block1[12] = append4_t[1]; - block1[13] = append4_t[2]; - block1[14] = append4_t[3]; - break; - - case 12: block0[12] |= append0_t[0]; - block0[13] = append0_t[1]; - block0[14] = append0_t[2]; - block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; - block1[ 1] = append1_t[1]; - block1[ 2] = append1_t[2]; - 
block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; - block1[ 5] = append2_t[1]; - block1[ 6] = append2_t[2]; - block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; - block1[ 9] = append3_t[1]; - block1[10] = append3_t[2]; - block1[11] = append3_t[3]; - block1[12] = append4_t[0]; - block1[13] = append4_t[1]; - block1[14] = append4_t[2]; - block1[15] = append4_t[3]; - break; - - case 13: block0[13] |= append0_t[0]; - block0[14] = append0_t[1]; - block0[15] = append0_t[2]; - block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; - block1[ 2] = append1_t[1]; - block1[ 3] = append1_t[2]; - block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; - block1[ 6] = append2_t[1]; - block1[ 7] = append2_t[2]; - block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; - block1[10] = append3_t[1]; - block1[11] = append3_t[2]; - block1[12] = append3_t[3]; - block1[13] = append4_t[0]; - block1[14] = append4_t[1]; - block1[15] = append4_t[2]; - break; - - case 14: block0[14] |= append0_t[0]; - block0[15] = append0_t[1]; - block1[ 0] = append0_t[2]; - block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; - block1[ 7] = append2_t[1]; - block1[ 8] = append2_t[2]; - block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; - block1[11] = append3_t[1]; - block1[12] = append3_t[2]; - block1[13] = append3_t[3]; - block1[14] = append4_t[0]; - block1[15] = append4_t[1]; - break; - - case 15: block0[15] |= append0_t[0]; - block1[ 0] = append0_t[1]; - block1[ 1] = append0_t[2]; - block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; - block1[ 8] = append2_t[1]; - block1[ 9] = append2_t[2]; - block1[10] = append2_t[3]; - block1[11] = append3_t[0]; - block1[12] = append3_t[1]; - block1[13] = append3_t[2]; - block1[14] = append3_t[3]; - block1[15] = 
append4_t[0]; - break; - - case 16: block1[ 0] |= append0_t[0]; - block1[ 1] = append0_t[1]; - block1[ 2] = append0_t[2]; - block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; - block1[ 5] = append1_t[1]; - block1[ 6] = append1_t[2]; - block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; - block1[ 9] = append2_t[1]; - block1[10] = append2_t[2]; - block1[11] = append2_t[3]; - block1[12] = append3_t[0]; - block1[13] = append3_t[1]; - block1[14] = append3_t[2]; - block1[15] = append3_t[3]; - break; - - case 17: block1[ 1] |= append0_t[0]; - block1[ 2] = append0_t[1]; - block1[ 3] = append0_t[2]; - block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; - block1[ 6] = append1_t[1]; - block1[ 7] = append1_t[2]; - block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; - block1[10] = append2_t[1]; - block1[11] = append2_t[2]; - block1[12] = append2_t[3]; - block1[13] = append3_t[0]; - block1[14] = append3_t[1]; - block1[15] = append3_t[2]; - break; - - case 18: block1[ 2] |= append0_t[0]; - block1[ 3] = append0_t[1]; - block1[ 4] = append0_t[2]; - block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; - block1[ 7] = append1_t[1]; - block1[ 8] = append1_t[2]; - block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; - block1[11] = append2_t[1]; - block1[12] = append2_t[2]; - block1[13] = append2_t[3]; - block1[14] = append3_t[0]; - block1[15] = append3_t[1]; - break; - - case 19: block1[ 3] |= append0_t[0]; - block1[ 4] = append0_t[1]; - block1[ 5] = append0_t[2]; - block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; - block1[ 8] = append1_t[1]; - block1[ 9] = append1_t[2]; - block1[10] = append1_t[3]; - block1[11] = append2_t[0]; - block1[12] = append2_t[1]; - block1[13] = append2_t[2]; - block1[14] = append2_t[3]; - block1[15] = append3_t[0]; - break; - - case 20: block1[ 4] |= append0_t[0]; - block1[ 5] = append0_t[1]; - block1[ 6] = append0_t[2]; - block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; - block1[ 9] = append1_t[1]; - block1[10] = 
append1_t[2]; - block1[11] = append1_t[3]; - block1[12] = append2_t[0]; - block1[13] = append2_t[1]; - block1[14] = append2_t[2]; - block1[15] = append2_t[3]; - break; - - case 21: block1[ 5] |= append0_t[0]; - block1[ 6] = append0_t[1]; - block1[ 7] = append0_t[2]; - block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; - block1[10] = append1_t[1]; - block1[11] = append1_t[2]; - block1[12] = append1_t[3]; - block1[13] = append2_t[0]; - block1[14] = append2_t[1]; - block1[15] = append2_t[2]; - break; - - case 22: block1[ 6] |= append0_t[0]; - block1[ 7] = append0_t[1]; - block1[ 8] = append0_t[2]; - block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; - block1[11] = append1_t[1]; - block1[12] = append1_t[2]; - block1[13] = append1_t[3]; - block1[14] = append2_t[0]; - block1[15] = append2_t[1]; - break; - - case 23: block1[ 7] |= append0_t[0]; - block1[ 8] = append0_t[1]; - block1[ 9] = append0_t[2]; - block1[10] = append0_t[3]; - block1[11] = append1_t[0]; - block1[12] = append1_t[1]; - block1[13] = append1_t[2]; - block1[14] = append1_t[3]; - block1[15] = append2_t[0]; - break; - - case 24: block1[ 8] |= append0_t[0]; - block1[ 9] = append0_t[1]; - block1[10] = append0_t[2]; - block1[11] = append0_t[3]; - block1[12] = append1_t[0]; - block1[13] = append1_t[1]; - block1[14] = append1_t[2]; - block1[15] = append1_t[3]; - break; - - case 25: block1[ 9] |= append0_t[0]; - block1[10] = append0_t[1]; - block1[11] = append0_t[2]; - block1[12] = append0_t[3]; - block1[13] = append1_t[0]; - block1[14] = append1_t[1]; - block1[15] = append1_t[2]; - break; - - case 26: block1[10] |= append0_t[0]; - block1[11] = append0_t[1]; - block1[12] = append0_t[2]; - block1[13] = append0_t[3]; - block1[14] = append1_t[0]; - block1[15] = append1_t[1]; - break; - - case 27: block1[11] |= append0_t[0]; - block1[12] = append0_t[1]; - block1[13] = append0_t[2]; - block1[14] = append0_t[3]; - block1[15] = append1_t[0]; - break; - - case 28: block1[12] |= append0_t[0]; - block1[13] = 
append0_t[1]; - block1[14] = append0_t[2]; - block1[15] = append0_t[3]; - break; - - case 29: block1[13] |= append0_t[0]; - block1[14] = append0_t[1]; - block1[15] = append0_t[2]; - break; - - case 30: block1[14] |= append0_t[0]; - block1[15] = append0_t[1]; - break; - } - - u32 new_len = offset + append_len; - - return new_len; -} - -__kernel void m11400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[ 0]; - pw_buf0[1] = pws[gid].i[ 1]; - pw_buf0[2] = pws[gid].i[ 2]; - pw_buf0[3] = pws[gid].i[ 3]; - pw_buf1[0] = pws[gid].i[ 4]; - pw_buf1[1] = pws[gid].i[ 5]; - pw_buf1[2] = pws[gid].i[ 6]; - pw_buf1[3] = pws[gid].i[ 7]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 
7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = 
esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - u32x w0[4] = { 0 }; - u32x w1[4] = { 0 }; - u32x w2[4] = { 0 }; - u32x w3[4] = { 0 }; - - const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - - append_0x80_2x4_VV (w0, w1, out_len); - - const u32x pw_salt_len = salt_len + out_len; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, out_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - 
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -__kernel void m11400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[ 0]; - pw_buf0[1] = pws[gid].i[ 1]; - pw_buf0[2] = pws[gid].i[ 2]; - pw_buf0[3] = pws[gid].i[ 3]; - pw_buf1[0] = pws[gid].i[ 4]; - pw_buf1[1] = pws[gid].i[ 5]; - pw_buf1[2] = pws[gid].i[ 6]; - pw_buf1[3] = pws[gid].i[ 7]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 
7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = 
esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - u32x w0[4] = { 0 }; - u32x w1[4] = { 0 }; - u32x w2[4] = { 0 }; - u32x w3[4] = { 0 }; - - const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); - - append_0x80_2x4_VV (w0, w1, out_len); - - const u32x pw_salt_len = salt_len + out_len; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, out_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - 
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -__kernel void m11400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl new file mode 100644 index 000000000..f75b2e104 --- /dev/null +++ b/OpenCL/m11400_a0.cl @@ -0,0 +1,254 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 
*bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1 = ctx0; + + md5_update (&ctx1, w, pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void 
m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md5_ctx_t ctx1 = ctx0; + + md5_update (&ctx1, w, pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, 
esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11400_a1-optimized.cl b/OpenCL/m11400_a1-optimized.cl deleted file mode 100644 index 4df896258..000000000 --- a/OpenCL/m11400_a1-optimized.cl +++ /dev/null @@ -1,2300 +0,0 @@ -/** - * Author......: See docs/credits.txt - * License.....: MIT - */ - -//incompatible because of brances -//#define NEW_SIMD_CODE - -#include "inc_vendor.cl" -#include "inc_hash_constants.h" -#include "inc_hash_functions.cl" -#include "inc_types.cl" -#include "inc_common.cl" -#include "inc_simd.cl" - -#if VECT_SIZE == 1 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) -#elif VECT_SIZE == 2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#elif VECT_SIZE == 4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#elif VECT_SIZE == 8 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) -#elif VECT_SIZE == 16 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) -#endif - -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) -{ - const u32 mod = offset & 3; - const u32 div = offset / 4; - - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 
- mod; - - u32x append00 = swap32 (append0[0]); - u32x append01 = swap32 (append0[1]); - u32x append02 = swap32 (append0[2]); - u32x append03 = swap32 (append0[3]); - u32x append10 = swap32 (append1[0]); - u32x append11 = swap32 (append1[1]); - u32x append12 = swap32 (append1[2]); - u32x append13 = swap32 (append1[3]); - u32x append20 = swap32 (append2[0]); - u32x append21 = swap32 (append2[1]); - u32x append22 = swap32 (append2[2]); - u32x append23 = swap32 (append2[3]); - u32x append30 = swap32 (append3[0]); - u32x append31 = swap32 (append3[1]); - u32x append32 = swap32 (append3[2]); - u32x append33 = swap32 (append3[3]); - - u32x append0_t[4]; - u32x append1_t[4]; - u32x append2_t[4]; - u32x append3_t[4]; - u32x append4_t[4]; - - append0_t[0] = amd_bytealign ( 0, append00, offset); - append0_t[1] = amd_bytealign (append00, append01, offset); - append0_t[2] = amd_bytealign (append01, append02, offset); - append0_t[3] = amd_bytealign (append02, append03, offset); - append1_t[0] = amd_bytealign (append03, append10, offset); - append1_t[1] = amd_bytealign (append10, append11, offset); - append1_t[2] = amd_bytealign (append11, append12, offset); - append1_t[3] = amd_bytealign (append12, append13, offset); - append2_t[0] = amd_bytealign (append13, append20, offset); - append2_t[1] = amd_bytealign (append20, append21, offset); - append2_t[2] = amd_bytealign (append21, append22, offset); - append2_t[3] = amd_bytealign (append22, append23, offset); - append3_t[0] = amd_bytealign (append23, append30, offset); - append3_t[1] = amd_bytealign (append30, append31, offset); - append3_t[2] = amd_bytealign (append31, append32, offset); - append3_t[3] = amd_bytealign (append32, append33, offset); - append4_t[0] = amd_bytealign (append33, 0, offset); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - append0_t[0] = swap32 (append0_t[0]); - append0_t[1] = swap32 (append0_t[1]); - append0_t[2] = swap32 (append0_t[2]); - append0_t[3] = swap32 (append0_t[3]); - 
append1_t[0] = swap32 (append1_t[0]); - append1_t[1] = swap32 (append1_t[1]); - append1_t[2] = swap32 (append1_t[2]); - append1_t[3] = swap32 (append1_t[3]); - append2_t[0] = swap32 (append2_t[0]); - append2_t[1] = swap32 (append2_t[1]); - append2_t[2] = swap32 (append2_t[2]); - append2_t[3] = swap32 (append2_t[3]); - append3_t[0] = swap32 (append3_t[0]); - append3_t[1] = swap32 (append3_t[1]); - append3_t[2] = swap32 (append3_t[2]); - append3_t[3] = swap32 (append3_t[3]); - append4_t[0] = swap32 (append4_t[0]); - append4_t[1] = swap32 (append4_t[1]); - append4_t[2] = swap32 (append4_t[2]); - append4_t[3] = swap32 (append4_t[3]); - - #endif - - #ifdef IS_NV - - const int offset_minus_4 = 4 - mod; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32x append00 = append0[0]; - u32x append01 = append0[1]; - u32x append02 = append0[2]; - u32x append03 = append0[3]; - u32x append10 = append1[0]; - u32x append11 = append1[1]; - u32x append12 = append1[2]; - u32x append13 = append1[3]; - u32x append20 = append2[0]; - u32x append21 = append2[1]; - u32x append22 = append2[2]; - u32x append23 = append2[3]; - u32x append30 = append3[0]; - u32x append31 = append3[1]; - u32x append32 = append3[2]; - u32x append33 = append3[3]; - - u32x append0_t[4]; - u32x append1_t[4]; - u32x append2_t[4]; - u32x append3_t[4]; - u32x append4_t[4]; - - append0_t[0] = __byte_perm ( 0, append00, selector); - append0_t[1] = __byte_perm (append00, append01, selector); - append0_t[2] = __byte_perm (append01, append02, selector); - append0_t[3] = __byte_perm (append02, append03, selector); - append1_t[0] = __byte_perm (append03, append10, selector); - append1_t[1] = __byte_perm (append10, append11, selector); - append1_t[2] = __byte_perm (append11, append12, selector); - append1_t[3] = __byte_perm (append12, append13, selector); - append2_t[0] = __byte_perm (append13, append20, selector); - append2_t[1] = __byte_perm (append20, append21, selector); - append2_t[2] = 
__byte_perm (append21, append22, selector); - append2_t[3] = __byte_perm (append22, append23, selector); - append3_t[0] = __byte_perm (append23, append30, selector); - append3_t[1] = __byte_perm (append30, append31, selector); - append3_t[2] = __byte_perm (append31, append32, selector); - append3_t[3] = __byte_perm (append32, append33, selector); - append4_t[0] = __byte_perm (append33, 0, selector); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - #endif - - switch (div) - { - case 0: block0[ 0] |= append0_t[0]; - block0[ 1] = append0_t[1]; - block0[ 2] = append0_t[2]; - block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; - block0[ 5] = append1_t[1]; - block0[ 6] = append1_t[2]; - block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; - block0[ 9] = append2_t[1]; - block0[10] = append2_t[2]; - block0[11] = append2_t[3]; - block0[12] = append3_t[0]; - block0[13] = append3_t[1]; - block0[14] = append3_t[2]; - block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; - block1[ 1] = append4_t[1]; - block1[ 2] = append4_t[2]; - block1[ 3] = append4_t[3]; - break; - - case 1: block0[ 1] |= append0_t[0]; - block0[ 2] = append0_t[1]; - block0[ 3] = append0_t[2]; - block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; - block0[ 6] = append1_t[1]; - block0[ 7] = append1_t[2]; - block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; - block0[10] = append2_t[1]; - block0[11] = append2_t[2]; - block0[12] = append2_t[3]; - block0[13] = append3_t[0]; - block0[14] = append3_t[1]; - block0[15] = append3_t[2]; - block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; - block1[ 2] = append4_t[1]; - block1[ 3] = append4_t[2]; - block1[ 4] = append4_t[3]; - break; - - case 2: block0[ 2] |= append0_t[0]; - block0[ 3] = append0_t[1]; - block0[ 4] = append0_t[2]; - block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; - block0[ 7] = append1_t[1]; - block0[ 8] = append1_t[2]; - block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; - block0[11] = append2_t[1]; - 
block0[12] = append2_t[2]; - block0[13] = append2_t[3]; - block0[14] = append3_t[0]; - block0[15] = append3_t[1]; - block1[ 0] = append3_t[2]; - block1[ 1] = append3_t[3]; - block1[ 2] = append4_t[0]; - block1[ 3] = append4_t[1]; - block1[ 4] = append4_t[2]; - block1[ 5] = append4_t[3]; - break; - - case 3: block0[ 3] |= append0_t[0]; - block0[ 4] = append0_t[1]; - block0[ 5] = append0_t[2]; - block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; - block0[ 8] = append1_t[1]; - block0[ 9] = append1_t[2]; - block0[10] = append1_t[3]; - block0[11] = append2_t[0]; - block0[12] = append2_t[1]; - block0[13] = append2_t[2]; - block0[14] = append2_t[3]; - block0[15] = append3_t[0]; - block1[ 0] = append3_t[1]; - block1[ 1] = append3_t[2]; - block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; - block1[ 4] = append4_t[1]; - block1[ 5] = append4_t[2]; - block1[ 6] = append4_t[3]; - break; - - case 4: block0[ 4] |= append0_t[0]; - block0[ 5] = append0_t[1]; - block0[ 6] = append0_t[2]; - block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; - block0[ 9] = append1_t[1]; - block0[10] = append1_t[2]; - block0[11] = append1_t[3]; - block0[12] = append2_t[0]; - block0[13] = append2_t[1]; - block0[14] = append2_t[2]; - block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; - block1[ 1] = append3_t[1]; - block1[ 2] = append3_t[2]; - block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; - block1[ 5] = append4_t[1]; - block1[ 6] = append4_t[2]; - block1[ 7] = append4_t[3]; - break; - - case 5: block0[ 5] |= append0_t[0]; - block0[ 6] = append0_t[1]; - block0[ 7] = append0_t[2]; - block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; - block0[10] = append1_t[1]; - block0[11] = append1_t[2]; - block0[12] = append1_t[3]; - block0[13] = append2_t[0]; - block0[14] = append2_t[1]; - block0[15] = append2_t[2]; - block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; - block1[ 2] = append3_t[1]; - block1[ 3] = append3_t[2]; - block1[ 4] = append3_t[3]; - block1[ 5] = 
append4_t[0]; - block1[ 6] = append4_t[1]; - block1[ 7] = append4_t[2]; - block1[ 8] = append4_t[3]; - break; - - case 6: block0[ 6] |= append0_t[0]; - block0[ 7] = append0_t[1]; - block0[ 8] = append0_t[2]; - block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; - block0[11] = append1_t[1]; - block0[12] = append1_t[2]; - block0[13] = append1_t[3]; - block0[14] = append2_t[0]; - block0[15] = append2_t[1]; - block1[ 0] = append2_t[2]; - block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; - block1[ 3] = append3_t[1]; - block1[ 4] = append3_t[2]; - block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; - block1[ 7] = append4_t[1]; - block1[ 8] = append4_t[2]; - block1[ 9] = append4_t[3]; - break; - - case 7: block0[ 7] |= append0_t[0]; - block0[ 8] = append0_t[1]; - block0[ 9] = append0_t[2]; - block0[10] = append0_t[3]; - block0[11] = append1_t[0]; - block0[12] = append1_t[1]; - block0[13] = append1_t[2]; - block0[14] = append1_t[3]; - block0[15] = append2_t[0]; - block1[ 0] = append2_t[1]; - block1[ 1] = append2_t[2]; - block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; - block1[ 4] = append3_t[1]; - block1[ 5] = append3_t[2]; - block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; - block1[ 8] = append4_t[1]; - block1[ 9] = append4_t[2]; - block1[10] = append4_t[3]; - break; - - case 8: block0[ 8] |= append0_t[0]; - block0[ 9] = append0_t[1]; - block0[10] = append0_t[2]; - block0[11] = append0_t[3]; - block0[12] = append1_t[0]; - block0[13] = append1_t[1]; - block0[14] = append1_t[2]; - block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; - block1[ 1] = append2_t[1]; - block1[ 2] = append2_t[2]; - block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; - block1[ 5] = append3_t[1]; - block1[ 6] = append3_t[2]; - block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; - block1[ 9] = append4_t[1]; - block1[10] = append4_t[2]; - block1[11] = append4_t[3]; - break; - - case 9: block0[ 9] |= append0_t[0]; - block0[10] = append0_t[1]; - block0[11] = 
append0_t[2]; - block0[12] = append0_t[3]; - block0[13] = append1_t[0]; - block0[14] = append1_t[1]; - block0[15] = append1_t[2]; - block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; - block1[ 2] = append2_t[1]; - block1[ 3] = append2_t[2]; - block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; - block1[ 6] = append3_t[1]; - block1[ 7] = append3_t[2]; - block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; - block1[10] = append4_t[1]; - block1[11] = append4_t[2]; - block1[12] = append4_t[3]; - break; - - case 10: block0[10] |= append0_t[0]; - block0[11] = append0_t[1]; - block0[12] = append0_t[2]; - block0[13] = append0_t[3]; - block0[14] = append1_t[0]; - block0[15] = append1_t[1]; - block1[ 0] = append1_t[2]; - block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; - block1[ 3] = append2_t[1]; - block1[ 4] = append2_t[2]; - block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; - block1[ 7] = append3_t[1]; - block1[ 8] = append3_t[2]; - block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; - block1[11] = append4_t[1]; - block1[12] = append4_t[2]; - block1[13] = append4_t[3]; - break; - - case 11: block0[11] |= append0_t[0]; - block0[12] = append0_t[1]; - block0[13] = append0_t[2]; - block0[14] = append0_t[3]; - block0[15] = append1_t[0]; - block1[ 0] = append1_t[1]; - block1[ 1] = append1_t[2]; - block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; - block1[ 4] = append2_t[1]; - block1[ 5] = append2_t[2]; - block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; - block1[ 8] = append3_t[1]; - block1[ 9] = append3_t[2]; - block1[10] = append3_t[3]; - block1[11] = append4_t[0]; - block1[12] = append4_t[1]; - block1[13] = append4_t[2]; - block1[14] = append4_t[3]; - break; - - case 12: block0[12] |= append0_t[0]; - block0[13] = append0_t[1]; - block0[14] = append0_t[2]; - block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; - block1[ 1] = append1_t[1]; - block1[ 2] = append1_t[2]; - block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; - 
block1[ 5] = append2_t[1]; - block1[ 6] = append2_t[2]; - block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; - block1[ 9] = append3_t[1]; - block1[10] = append3_t[2]; - block1[11] = append3_t[3]; - block1[12] = append4_t[0]; - block1[13] = append4_t[1]; - block1[14] = append4_t[2]; - block1[15] = append4_t[3]; - break; - - case 13: block0[13] |= append0_t[0]; - block0[14] = append0_t[1]; - block0[15] = append0_t[2]; - block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; - block1[ 2] = append1_t[1]; - block1[ 3] = append1_t[2]; - block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; - block1[ 6] = append2_t[1]; - block1[ 7] = append2_t[2]; - block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; - block1[10] = append3_t[1]; - block1[11] = append3_t[2]; - block1[12] = append3_t[3]; - block1[13] = append4_t[0]; - block1[14] = append4_t[1]; - block1[15] = append4_t[2]; - break; - - case 14: block0[14] |= append0_t[0]; - block0[15] = append0_t[1]; - block1[ 0] = append0_t[2]; - block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; - block1[ 7] = append2_t[1]; - block1[ 8] = append2_t[2]; - block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; - block1[11] = append3_t[1]; - block1[12] = append3_t[2]; - block1[13] = append3_t[3]; - block1[14] = append4_t[0]; - block1[15] = append4_t[1]; - break; - - case 15: block0[15] |= append0_t[0]; - block1[ 0] = append0_t[1]; - block1[ 1] = append0_t[2]; - block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; - block1[ 8] = append2_t[1]; - block1[ 9] = append2_t[2]; - block1[10] = append2_t[3]; - block1[11] = append3_t[0]; - block1[12] = append3_t[1]; - block1[13] = append3_t[2]; - block1[14] = append3_t[3]; - block1[15] = append4_t[0]; - break; - - case 16: block1[ 0] |= append0_t[0]; - 
block1[ 1] = append0_t[1]; - block1[ 2] = append0_t[2]; - block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; - block1[ 5] = append1_t[1]; - block1[ 6] = append1_t[2]; - block1[ 7] = append1_t[3]; - block1[ 8] = append2_t[0]; - block1[ 9] = append2_t[1]; - block1[10] = append2_t[2]; - block1[11] = append2_t[3]; - block1[12] = append3_t[0]; - block1[13] = append3_t[1]; - block1[14] = append3_t[2]; - block1[15] = append3_t[3]; - break; - - case 17: block1[ 1] |= append0_t[0]; - block1[ 2] = append0_t[1]; - block1[ 3] = append0_t[2]; - block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; - block1[ 6] = append1_t[1]; - block1[ 7] = append1_t[2]; - block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; - block1[10] = append2_t[1]; - block1[11] = append2_t[2]; - block1[12] = append2_t[3]; - block1[13] = append3_t[0]; - block1[14] = append3_t[1]; - block1[15] = append3_t[2]; - break; - - case 18: block1[ 2] |= append0_t[0]; - block1[ 3] = append0_t[1]; - block1[ 4] = append0_t[2]; - block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; - block1[ 7] = append1_t[1]; - block1[ 8] = append1_t[2]; - block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; - block1[11] = append2_t[1]; - block1[12] = append2_t[2]; - block1[13] = append2_t[3]; - block1[14] = append3_t[0]; - block1[15] = append3_t[1]; - break; - - case 19: block1[ 3] |= append0_t[0]; - block1[ 4] = append0_t[1]; - block1[ 5] = append0_t[2]; - block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; - block1[ 8] = append1_t[1]; - block1[ 9] = append1_t[2]; - block1[10] = append1_t[3]; - block1[11] = append2_t[0]; - block1[12] = append2_t[1]; - block1[13] = append2_t[2]; - block1[14] = append2_t[3]; - block1[15] = append3_t[0]; - break; - - case 20: block1[ 4] |= append0_t[0]; - block1[ 5] = append0_t[1]; - block1[ 6] = append0_t[2]; - block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; - block1[ 9] = append1_t[1]; - block1[10] = append1_t[2]; - block1[11] = append1_t[3]; - block1[12] = append2_t[0]; - 
block1[13] = append2_t[1]; - block1[14] = append2_t[2]; - block1[15] = append2_t[3]; - break; - - case 21: block1[ 5] |= append0_t[0]; - block1[ 6] = append0_t[1]; - block1[ 7] = append0_t[2]; - block1[ 8] = append0_t[3]; - block1[ 9] = append1_t[0]; - block1[10] = append1_t[1]; - block1[11] = append1_t[2]; - block1[12] = append1_t[3]; - block1[13] = append2_t[0]; - block1[14] = append2_t[1]; - block1[15] = append2_t[2]; - break; - - case 22: block1[ 6] |= append0_t[0]; - block1[ 7] = append0_t[1]; - block1[ 8] = append0_t[2]; - block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; - block1[11] = append1_t[1]; - block1[12] = append1_t[2]; - block1[13] = append1_t[3]; - block1[14] = append2_t[0]; - block1[15] = append2_t[1]; - break; - - case 23: block1[ 7] |= append0_t[0]; - block1[ 8] = append0_t[1]; - block1[ 9] = append0_t[2]; - block1[10] = append0_t[3]; - block1[11] = append1_t[0]; - block1[12] = append1_t[1]; - block1[13] = append1_t[2]; - block1[14] = append1_t[3]; - block1[15] = append2_t[0]; - break; - - case 24: block1[ 8] |= append0_t[0]; - block1[ 9] = append0_t[1]; - block1[10] = append0_t[2]; - block1[11] = append0_t[3]; - block1[12] = append1_t[0]; - block1[13] = append1_t[1]; - block1[14] = append1_t[2]; - block1[15] = append1_t[3]; - break; - - case 25: block1[ 9] |= append0_t[0]; - block1[10] = append0_t[1]; - block1[11] = append0_t[2]; - block1[12] = append0_t[3]; - block1[13] = append1_t[0]; - block1[14] = append1_t[1]; - block1[15] = append1_t[2]; - break; - - case 26: block1[10] |= append0_t[0]; - block1[11] = append0_t[1]; - block1[12] = append0_t[2]; - block1[13] = append0_t[3]; - block1[14] = append1_t[0]; - block1[15] = append1_t[1]; - break; - - case 27: block1[11] |= append0_t[0]; - block1[12] = append0_t[1]; - block1[13] = append0_t[2]; - block1[14] = append0_t[3]; - block1[15] = append1_t[0]; - break; - - case 28: block1[12] |= append0_t[0]; - block1[13] = append0_t[1]; - block1[14] = append0_t[2]; - block1[15] = append0_t[3]; - 
break; - - case 29: block1[13] |= append0_t[0]; - block1[14] = append0_t[1]; - block1[15] = append0_t[2]; - break; - - case 30: block1[14] |= append0_t[0]; - block1[15] = append0_t[1]; - break; - } - - u32 new_len = offset + append_len; - - return new_len; -} - -__kernel void m11400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[0]; - pw_buf0[1] = pws[gid].i[1]; - pw_buf0[2] = pws[gid].i[2]; - pw_buf0[3] = pws[gid].i[3]; - pw_buf1[0] = pws[gid].i[4]; - pw_buf1[1] = pws[gid].i[5]; - pw_buf1[2] = pws[gid].i[6]; - pw_buf1[3] = pws[gid].i[7]; - - const u32 pw_l_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = 
esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - 
esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - /** - * concat password candidate - */ - - u32x wordl0[4] = { 0 }; - u32x wordl1[4] = { 0 }; - u32x wordl2[4] = { 0 }; - u32x wordl3[4] = { 0 }; - - wordl0[0] = pw_buf0[0]; - wordl0[1] = pw_buf0[1]; - wordl0[2] = pw_buf0[2]; - wordl0[3] = pw_buf0[3]; - wordl1[0] = pw_buf1[0]; - wordl1[1] = pw_buf1[1]; - wordl1[2] = pw_buf1[2]; - wordl1[3] = pw_buf1[3]; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] 
= ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); - - if (combs_mode == COMBINATOR_MODE_BASE_LEFT) - { - switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); - } - else - { - switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); - } - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = wordl0[0] | wordr0[0]; - w0[1] = wordl0[1] | wordr0[1]; - w0[2] = wordl0[2] | wordr0[2]; - w0[3] = wordl0[3] | wordr0[3]; - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; - w2[0] = wordl2[0] | wordr2[0]; - w2[1] = wordl2[1] | wordr2[1]; - w2[2] = wordl2[2] | wordr2[2]; - w2[3] = wordl2[3] | wordr2[3]; - w3[0] = wordl3[0] | wordr3[0]; - w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - w3[3] = wordl3[3] | wordr3[3]; - - const u32x pw_salt_len = salt_len + pw_len; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - 
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -__kernel void m11400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * base - */ - - u32 pw_buf0[4]; - u32 pw_buf1[4]; - - pw_buf0[0] = pws[gid].i[0]; - pw_buf0[1] = pws[gid].i[1]; - pw_buf0[2] = pws[gid].i[2]; - pw_buf0[3] = pws[gid].i[3]; - pw_buf1[0] = pws[gid].i[4]; - pw_buf1[1] = pws[gid].i[5]; - pw_buf1[2] = pws[gid].i[6]; - pw_buf1[3] = pws[gid].i[7]; - - const u32 pw_l_len = pws[gid].pw_len; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = 
esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - 
esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - const u32 remaining_bytes = digest_esalt_len + 1 - 64; // substract previous block - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * loop - */ - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - - const u32x pw_len = pw_l_len + pw_r_len; - - /** - * concat password candidate - */ - - u32x wordl0[4] = { 0 }; - u32x wordl1[4] = { 0 }; - u32x wordl2[4] = { 0 }; - u32x wordl3[4] = { 0 }; - - wordl0[0] = pw_buf0[0]; - wordl0[1] = pw_buf0[1]; - wordl0[2] = pw_buf0[2]; - wordl0[3] = pw_buf0[3]; - wordl1[0] = 
pw_buf1[0]; - wordl1[1] = pw_buf1[1]; - wordl1[2] = pw_buf1[2]; - wordl1[3] = pw_buf1[3]; - - u32x wordr0[4] = { 0 }; - u32x wordr1[4] = { 0 }; - u32x wordr2[4] = { 0 }; - u32x wordr3[4] = { 0 }; - - wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); - wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); - wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); - wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); - wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); - wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); - wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); - wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); - - if (combs_mode == COMBINATOR_MODE_BASE_LEFT) - { - switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); - } - else - { - switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); - } - - u32x w0[4]; - u32x w1[4]; - u32x w2[4]; - u32x w3[4]; - - w0[0] = wordl0[0] | wordr0[0]; - w0[1] = wordl0[1] | wordr0[1]; - w0[2] = wordl0[2] | wordr0[2]; - w0[3] = wordl0[3] | wordr0[3]; - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; - w2[0] = wordl2[0] | wordr2[0]; - w2[1] = wordl2[1] | wordr2[1]; - w2[2] = wordl2[2] | wordr2[2]; - w2[3] = wordl2[3] | wordr2[3]; - w3[0] = wordl3[0] | wordr3[0]; - w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - w3[3] = wordl3[3] | wordr3[3]; - - const u32x pw_salt_len = salt_len + pw_len; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32 block_len = 0; - - block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - if (block_len < 56) - { - w3_t[2] = pw_salt_len * 8; - } - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - if (block_len > 55) - { - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; 
- w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - 
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - } - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - // it is the final block when no more than 55 bytes left - - if (remaining_bytes < 56) - { - // it is the last block ! 
- - w3_t[2] = digest_esalt_len * 8; - } - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, 
w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // sometimes (not rare at all) we need a third block :( - - if (remaining_bytes > 55) - { - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - 
w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, 
MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP 
(MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - } - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -__kernel void m11400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} - -__kernel void m11400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ -} diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl new file mode 100644 index 000000000..67b13c3bc --- /dev/null +++ b/OpenCL/m11400_a1.cl @@ -0,0 +1,226 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, 
__global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global 
void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md5_ctx_t ctx1 = ctx0; + + md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md5_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + + md5_ctx_t ctx; + + md5_init (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_global (&ctx, esalt_bufs[digests_offset].esalt_buf, esalt_bufs[digests_offset].esalt_len); + + md5_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 
r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m11400_a3-optimized.cl b/OpenCL/m11400_a3-optimized.cl deleted file mode 100644 index 4e883375d..000000000 --- a/OpenCL/m11400_a3-optimized.cl +++ /dev/null @@ -1,5968 +0,0 @@ -/** - * Author......: See docs/credits.txt - * License.....: MIT - */ - -//incompatible because of brances -//#define NEW_SIMD_CODE - -#include "inc_vendor.cl" -#include "inc_hash_constants.h" -#include "inc_hash_functions.cl" -#include "inc_types.cl" -#include "inc_common.cl" -#include "inc_simd.cl" - -#if VECT_SIZE == 1 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) -#elif VECT_SIZE == 2 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) -#elif VECT_SIZE == 4 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) -#elif VECT_SIZE == 8 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) -#elif VECT_SIZE == 16 -#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) -#endif - -u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) -{ - const u32 mod = offset & 3; - const u32 div = offset / 4; - - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - mod; - - u32x append00 = swap32 (append0[0]); - u32x append01 = swap32 (append0[1]); - u32x append02 = swap32 (append0[2]); - u32x append03 = swap32 (append0[3]); - u32x 
append10 = swap32 (append1[0]); - u32x append11 = swap32 (append1[1]); - u32x append12 = swap32 (append1[2]); - u32x append13 = swap32 (append1[3]); - u32x append20 = swap32 (append2[0]); - u32x append21 = swap32 (append2[1]); - u32x append22 = swap32 (append2[2]); - u32x append23 = swap32 (append2[3]); - u32x append30 = swap32 (append3[0]); - u32x append31 = swap32 (append3[1]); - u32x append32 = swap32 (append3[2]); - u32x append33 = swap32 (append3[3]); - - u32x append0_t[4]; - u32x append1_t[4]; - u32x append2_t[4]; - u32x append3_t[4]; - u32x append4_t[4]; - - append0_t[0] = amd_bytealign ( 0, append00, offset); - append0_t[1] = amd_bytealign (append00, append01, offset); - append0_t[2] = amd_bytealign (append01, append02, offset); - append0_t[3] = amd_bytealign (append02, append03, offset); - append1_t[0] = amd_bytealign (append03, append10, offset); - append1_t[1] = amd_bytealign (append10, append11, offset); - append1_t[2] = amd_bytealign (append11, append12, offset); - append1_t[3] = amd_bytealign (append12, append13, offset); - append2_t[0] = amd_bytealign (append13, append20, offset); - append2_t[1] = amd_bytealign (append20, append21, offset); - append2_t[2] = amd_bytealign (append21, append22, offset); - append2_t[3] = amd_bytealign (append22, append23, offset); - append3_t[0] = amd_bytealign (append23, append30, offset); - append3_t[1] = amd_bytealign (append30, append31, offset); - append3_t[2] = amd_bytealign (append31, append32, offset); - append3_t[3] = amd_bytealign (append32, append33, offset); - append4_t[0] = amd_bytealign (append33, 0, offset); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - append0_t[0] = swap32 (append0_t[0]); - append0_t[1] = swap32 (append0_t[1]); - append0_t[2] = swap32 (append0_t[2]); - append0_t[3] = swap32 (append0_t[3]); - append1_t[0] = swap32 (append1_t[0]); - append1_t[1] = swap32 (append1_t[1]); - append1_t[2] = swap32 (append1_t[2]); - append1_t[3] = swap32 (append1_t[3]); - append2_t[0] = swap32 
(append2_t[0]); - append2_t[1] = swap32 (append2_t[1]); - append2_t[2] = swap32 (append2_t[2]); - append2_t[3] = swap32 (append2_t[3]); - append3_t[0] = swap32 (append3_t[0]); - append3_t[1] = swap32 (append3_t[1]); - append3_t[2] = swap32 (append3_t[2]); - append3_t[3] = swap32 (append3_t[3]); - append4_t[0] = swap32 (append4_t[0]); - append4_t[1] = swap32 (append4_t[1]); - append4_t[2] = swap32 (append4_t[2]); - append4_t[3] = swap32 (append4_t[3]); - - #endif - - #ifdef IS_NV - - const int offset_minus_4 = 4 - mod; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32x append00 = append0[0]; - u32x append01 = append0[1]; - u32x append02 = append0[2]; - u32x append03 = append0[3]; - u32x append10 = append1[0]; - u32x append11 = append1[1]; - u32x append12 = append1[2]; - u32x append13 = append1[3]; - u32x append20 = append2[0]; - u32x append21 = append2[1]; - u32x append22 = append2[2]; - u32x append23 = append2[3]; - u32x append30 = append3[0]; - u32x append31 = append3[1]; - u32x append32 = append3[2]; - u32x append33 = append3[3]; - - u32x append0_t[4]; - u32x append1_t[4]; - u32x append2_t[4]; - u32x append3_t[4]; - u32x append4_t[4]; - - append0_t[0] = __byte_perm ( 0, append00, selector); - append0_t[1] = __byte_perm (append00, append01, selector); - append0_t[2] = __byte_perm (append01, append02, selector); - append0_t[3] = __byte_perm (append02, append03, selector); - append1_t[0] = __byte_perm (append03, append10, selector); - append1_t[1] = __byte_perm (append10, append11, selector); - append1_t[2] = __byte_perm (append11, append12, selector); - append1_t[3] = __byte_perm (append12, append13, selector); - append2_t[0] = __byte_perm (append13, append20, selector); - append2_t[1] = __byte_perm (append20, append21, selector); - append2_t[2] = __byte_perm (append21, append22, selector); - append2_t[3] = __byte_perm (append22, append23, selector); - append3_t[0] = __byte_perm (append23, append30, selector); - append3_t[1] = 
__byte_perm (append30, append31, selector); - append3_t[2] = __byte_perm (append31, append32, selector); - append3_t[3] = __byte_perm (append32, append33, selector); - append4_t[0] = __byte_perm (append33, 0, selector); - append4_t[1] = 0; - append4_t[2] = 0; - append4_t[3] = 0; - - #endif - - switch (div) - { - case 0: block0[ 0] |= append0_t[0]; - block0[ 1] = append0_t[1]; - block0[ 2] = append0_t[2]; - block0[ 3] = append0_t[3]; - block0[ 4] = append1_t[0]; - block0[ 5] = append1_t[1]; - block0[ 6] = append1_t[2]; - block0[ 7] = append1_t[3]; - block0[ 8] = append2_t[0]; - block0[ 9] = append2_t[1]; - block0[10] = append2_t[2]; - block0[11] = append2_t[3]; - block0[12] = append3_t[0]; - block0[13] = append3_t[1]; - block0[14] = append3_t[2]; - block0[15] = append3_t[3]; - block1[ 0] = append4_t[0]; - block1[ 1] = append4_t[1]; - block1[ 2] = append4_t[2]; - block1[ 3] = append4_t[3]; - break; - - case 1: block0[ 1] |= append0_t[0]; - block0[ 2] = append0_t[1]; - block0[ 3] = append0_t[2]; - block0[ 4] = append0_t[3]; - block0[ 5] = append1_t[0]; - block0[ 6] = append1_t[1]; - block0[ 7] = append1_t[2]; - block0[ 8] = append1_t[3]; - block0[ 9] = append2_t[0]; - block0[10] = append2_t[1]; - block0[11] = append2_t[2]; - block0[12] = append2_t[3]; - block0[13] = append3_t[0]; - block0[14] = append3_t[1]; - block0[15] = append3_t[2]; - block1[ 0] = append3_t[3]; - block1[ 1] = append4_t[0]; - block1[ 2] = append4_t[1]; - block1[ 3] = append4_t[2]; - block1[ 4] = append4_t[3]; - break; - - case 2: block0[ 2] |= append0_t[0]; - block0[ 3] = append0_t[1]; - block0[ 4] = append0_t[2]; - block0[ 5] = append0_t[3]; - block0[ 6] = append1_t[0]; - block0[ 7] = append1_t[1]; - block0[ 8] = append1_t[2]; - block0[ 9] = append1_t[3]; - block0[10] = append2_t[0]; - block0[11] = append2_t[1]; - block0[12] = append2_t[2]; - block0[13] = append2_t[3]; - block0[14] = append3_t[0]; - block0[15] = append3_t[1]; - block1[ 0] = append3_t[2]; - block1[ 1] = append3_t[3]; - block1[ 2] = 
append4_t[0]; - block1[ 3] = append4_t[1]; - block1[ 4] = append4_t[2]; - block1[ 5] = append4_t[3]; - break; - - case 3: block0[ 3] |= append0_t[0]; - block0[ 4] = append0_t[1]; - block0[ 5] = append0_t[2]; - block0[ 6] = append0_t[3]; - block0[ 7] = append1_t[0]; - block0[ 8] = append1_t[1]; - block0[ 9] = append1_t[2]; - block0[10] = append1_t[3]; - block0[11] = append2_t[0]; - block0[12] = append2_t[1]; - block0[13] = append2_t[2]; - block0[14] = append2_t[3]; - block0[15] = append3_t[0]; - block1[ 0] = append3_t[1]; - block1[ 1] = append3_t[2]; - block1[ 2] = append3_t[3]; - block1[ 3] = append4_t[0]; - block1[ 4] = append4_t[1]; - block1[ 5] = append4_t[2]; - block1[ 6] = append4_t[3]; - break; - - case 4: block0[ 4] |= append0_t[0]; - block0[ 5] = append0_t[1]; - block0[ 6] = append0_t[2]; - block0[ 7] = append0_t[3]; - block0[ 8] = append1_t[0]; - block0[ 9] = append1_t[1]; - block0[10] = append1_t[2]; - block0[11] = append1_t[3]; - block0[12] = append2_t[0]; - block0[13] = append2_t[1]; - block0[14] = append2_t[2]; - block0[15] = append2_t[3]; - block1[ 0] = append3_t[0]; - block1[ 1] = append3_t[1]; - block1[ 2] = append3_t[2]; - block1[ 3] = append3_t[3]; - block1[ 4] = append4_t[0]; - block1[ 5] = append4_t[1]; - block1[ 6] = append4_t[2]; - block1[ 7] = append4_t[3]; - break; - - case 5: block0[ 5] |= append0_t[0]; - block0[ 6] = append0_t[1]; - block0[ 7] = append0_t[2]; - block0[ 8] = append0_t[3]; - block0[ 9] = append1_t[0]; - block0[10] = append1_t[1]; - block0[11] = append1_t[2]; - block0[12] = append1_t[3]; - block0[13] = append2_t[0]; - block0[14] = append2_t[1]; - block0[15] = append2_t[2]; - block1[ 0] = append2_t[3]; - block1[ 1] = append3_t[0]; - block1[ 2] = append3_t[1]; - block1[ 3] = append3_t[2]; - block1[ 4] = append3_t[3]; - block1[ 5] = append4_t[0]; - block1[ 6] = append4_t[1]; - block1[ 7] = append4_t[2]; - block1[ 8] = append4_t[3]; - break; - - case 6: block0[ 6] |= append0_t[0]; - block0[ 7] = append0_t[1]; - block0[ 8] = 
append0_t[2]; - block0[ 9] = append0_t[3]; - block0[10] = append1_t[0]; - block0[11] = append1_t[1]; - block0[12] = append1_t[2]; - block0[13] = append1_t[3]; - block0[14] = append2_t[0]; - block0[15] = append2_t[1]; - block1[ 0] = append2_t[2]; - block1[ 1] = append2_t[3]; - block1[ 2] = append3_t[0]; - block1[ 3] = append3_t[1]; - block1[ 4] = append3_t[2]; - block1[ 5] = append3_t[3]; - block1[ 6] = append4_t[0]; - block1[ 7] = append4_t[1]; - block1[ 8] = append4_t[2]; - block1[ 9] = append4_t[3]; - break; - - case 7: block0[ 7] |= append0_t[0]; - block0[ 8] = append0_t[1]; - block0[ 9] = append0_t[2]; - block0[10] = append0_t[3]; - block0[11] = append1_t[0]; - block0[12] = append1_t[1]; - block0[13] = append1_t[2]; - block0[14] = append1_t[3]; - block0[15] = append2_t[0]; - block1[ 0] = append2_t[1]; - block1[ 1] = append2_t[2]; - block1[ 2] = append2_t[3]; - block1[ 3] = append3_t[0]; - block1[ 4] = append3_t[1]; - block1[ 5] = append3_t[2]; - block1[ 6] = append3_t[3]; - block1[ 7] = append4_t[0]; - block1[ 8] = append4_t[1]; - block1[ 9] = append4_t[2]; - block1[10] = append4_t[3]; - break; - - case 8: block0[ 8] |= append0_t[0]; - block0[ 9] = append0_t[1]; - block0[10] = append0_t[2]; - block0[11] = append0_t[3]; - block0[12] = append1_t[0]; - block0[13] = append1_t[1]; - block0[14] = append1_t[2]; - block0[15] = append1_t[3]; - block1[ 0] = append2_t[0]; - block1[ 1] = append2_t[1]; - block1[ 2] = append2_t[2]; - block1[ 3] = append2_t[3]; - block1[ 4] = append3_t[0]; - block1[ 5] = append3_t[1]; - block1[ 6] = append3_t[2]; - block1[ 7] = append3_t[3]; - block1[ 8] = append4_t[0]; - block1[ 9] = append4_t[1]; - block1[10] = append4_t[2]; - block1[11] = append4_t[3]; - break; - - case 9: block0[ 9] |= append0_t[0]; - block0[10] = append0_t[1]; - block0[11] = append0_t[2]; - block0[12] = append0_t[3]; - block0[13] = append1_t[0]; - block0[14] = append1_t[1]; - block0[15] = append1_t[2]; - block1[ 0] = append1_t[3]; - block1[ 1] = append2_t[0]; - block1[ 
2] = append2_t[1]; - block1[ 3] = append2_t[2]; - block1[ 4] = append2_t[3]; - block1[ 5] = append3_t[0]; - block1[ 6] = append3_t[1]; - block1[ 7] = append3_t[2]; - block1[ 8] = append3_t[3]; - block1[ 9] = append4_t[0]; - block1[10] = append4_t[1]; - block1[11] = append4_t[2]; - block1[12] = append4_t[3]; - break; - - case 10: block0[10] |= append0_t[0]; - block0[11] = append0_t[1]; - block0[12] = append0_t[2]; - block0[13] = append0_t[3]; - block0[14] = append1_t[0]; - block0[15] = append1_t[1]; - block1[ 0] = append1_t[2]; - block1[ 1] = append1_t[3]; - block1[ 2] = append2_t[0]; - block1[ 3] = append2_t[1]; - block1[ 4] = append2_t[2]; - block1[ 5] = append2_t[3]; - block1[ 6] = append3_t[0]; - block1[ 7] = append3_t[1]; - block1[ 8] = append3_t[2]; - block1[ 9] = append3_t[3]; - block1[10] = append4_t[0]; - block1[11] = append4_t[1]; - block1[12] = append4_t[2]; - block1[13] = append4_t[3]; - break; - - case 11: block0[11] |= append0_t[0]; - block0[12] = append0_t[1]; - block0[13] = append0_t[2]; - block0[14] = append0_t[3]; - block0[15] = append1_t[0]; - block1[ 0] = append1_t[1]; - block1[ 1] = append1_t[2]; - block1[ 2] = append1_t[3]; - block1[ 3] = append2_t[0]; - block1[ 4] = append2_t[1]; - block1[ 5] = append2_t[2]; - block1[ 6] = append2_t[3]; - block1[ 7] = append3_t[0]; - block1[ 8] = append3_t[1]; - block1[ 9] = append3_t[2]; - block1[10] = append3_t[3]; - block1[11] = append4_t[0]; - block1[12] = append4_t[1]; - block1[13] = append4_t[2]; - block1[14] = append4_t[3]; - break; - - case 12: block0[12] |= append0_t[0]; - block0[13] = append0_t[1]; - block0[14] = append0_t[2]; - block0[15] = append0_t[3]; - block1[ 0] = append1_t[0]; - block1[ 1] = append1_t[1]; - block1[ 2] = append1_t[2]; - block1[ 3] = append1_t[3]; - block1[ 4] = append2_t[0]; - block1[ 5] = append2_t[1]; - block1[ 6] = append2_t[2]; - block1[ 7] = append2_t[3]; - block1[ 8] = append3_t[0]; - block1[ 9] = append3_t[1]; - block1[10] = append3_t[2]; - block1[11] = append3_t[3]; - 
block1[12] = append4_t[0]; - block1[13] = append4_t[1]; - block1[14] = append4_t[2]; - block1[15] = append4_t[3]; - break; - - case 13: block0[13] |= append0_t[0]; - block0[14] = append0_t[1]; - block0[15] = append0_t[2]; - block1[ 0] = append0_t[3]; - block1[ 1] = append1_t[0]; - block1[ 2] = append1_t[1]; - block1[ 3] = append1_t[2]; - block1[ 4] = append1_t[3]; - block1[ 5] = append2_t[0]; - block1[ 6] = append2_t[1]; - block1[ 7] = append2_t[2]; - block1[ 8] = append2_t[3]; - block1[ 9] = append3_t[0]; - block1[10] = append3_t[1]; - block1[11] = append3_t[2]; - block1[12] = append3_t[3]; - block1[13] = append4_t[0]; - block1[14] = append4_t[1]; - block1[15] = append4_t[2]; - break; - - case 14: block0[14] |= append0_t[0]; - block0[15] = append0_t[1]; - block1[ 0] = append0_t[2]; - block1[ 1] = append0_t[3]; - block1[ 2] = append1_t[0]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append2_t[0]; - block1[ 7] = append2_t[1]; - block1[ 8] = append2_t[2]; - block1[ 9] = append2_t[3]; - block1[10] = append3_t[0]; - block1[11] = append3_t[1]; - block1[12] = append3_t[2]; - block1[13] = append3_t[3]; - block1[14] = append4_t[0]; - block1[15] = append4_t[1]; - break; - - case 15: block0[15] |= append0_t[0]; - block1[ 0] = append0_t[1]; - block1[ 1] = append0_t[2]; - block1[ 2] = append0_t[3]; - block1[ 3] = append1_t[1]; - block1[ 4] = append1_t[2]; - block1[ 5] = append1_t[3]; - block1[ 6] = append1_t[0]; - block1[ 7] = append2_t[0]; - block1[ 8] = append2_t[1]; - block1[ 9] = append2_t[2]; - block1[10] = append2_t[3]; - block1[11] = append3_t[0]; - block1[12] = append3_t[1]; - block1[13] = append3_t[2]; - block1[14] = append3_t[3]; - block1[15] = append4_t[0]; - break; - - case 16: block1[ 0] |= append0_t[0]; - block1[ 1] = append0_t[1]; - block1[ 2] = append0_t[2]; - block1[ 3] = append0_t[3]; - block1[ 4] = append1_t[0]; - block1[ 5] = append1_t[1]; - block1[ 6] = append1_t[2]; - block1[ 7] = append1_t[3]; - 
block1[ 8] = append2_t[0]; - block1[ 9] = append2_t[1]; - block1[10] = append2_t[2]; - block1[11] = append2_t[3]; - block1[12] = append3_t[0]; - block1[13] = append3_t[1]; - block1[14] = append3_t[2]; - block1[15] = append3_t[3]; - break; - - case 17: block1[ 1] |= append0_t[0]; - block1[ 2] = append0_t[1]; - block1[ 3] = append0_t[2]; - block1[ 4] = append0_t[3]; - block1[ 5] = append1_t[0]; - block1[ 6] = append1_t[1]; - block1[ 7] = append1_t[2]; - block1[ 8] = append1_t[3]; - block1[ 9] = append2_t[0]; - block1[10] = append2_t[1]; - block1[11] = append2_t[2]; - block1[12] = append2_t[3]; - block1[13] = append3_t[0]; - block1[14] = append3_t[1]; - block1[15] = append3_t[2]; - break; - - case 18: block1[ 2] |= append0_t[0]; - block1[ 3] = append0_t[1]; - block1[ 4] = append0_t[2]; - block1[ 5] = append0_t[3]; - block1[ 6] = append1_t[0]; - block1[ 7] = append1_t[1]; - block1[ 8] = append1_t[2]; - block1[ 9] = append1_t[3]; - block1[10] = append2_t[0]; - block1[11] = append2_t[1]; - block1[12] = append2_t[2]; - block1[13] = append2_t[3]; - block1[14] = append3_t[0]; - block1[15] = append3_t[1]; - break; - - case 19: block1[ 3] |= append0_t[0]; - block1[ 4] = append0_t[1]; - block1[ 5] = append0_t[2]; - block1[ 6] = append0_t[3]; - block1[ 7] = append1_t[0]; - block1[ 8] = append1_t[1]; - block1[ 9] = append1_t[2]; - block1[10] = append1_t[3]; - block1[11] = append2_t[0]; - block1[12] = append2_t[1]; - block1[13] = append2_t[2]; - block1[14] = append2_t[3]; - block1[15] = append3_t[0]; - break; - - case 20: block1[ 4] |= append0_t[0]; - block1[ 5] = append0_t[1]; - block1[ 6] = append0_t[2]; - block1[ 7] = append0_t[3]; - block1[ 8] = append1_t[0]; - block1[ 9] = append1_t[1]; - block1[10] = append1_t[2]; - block1[11] = append1_t[3]; - block1[12] = append2_t[0]; - block1[13] = append2_t[1]; - block1[14] = append2_t[2]; - block1[15] = append2_t[3]; - break; - - case 21: block1[ 5] |= append0_t[0]; - block1[ 6] = append0_t[1]; - block1[ 7] = append0_t[2]; - block1[ 
8] = append0_t[3]; - block1[ 9] = append1_t[0]; - block1[10] = append1_t[1]; - block1[11] = append1_t[2]; - block1[12] = append1_t[3]; - block1[13] = append2_t[0]; - block1[14] = append2_t[1]; - block1[15] = append2_t[2]; - break; - - case 22: block1[ 6] |= append0_t[0]; - block1[ 7] = append0_t[1]; - block1[ 8] = append0_t[2]; - block1[ 9] = append0_t[3]; - block1[10] = append1_t[0]; - block1[11] = append1_t[1]; - block1[12] = append1_t[2]; - block1[13] = append1_t[3]; - block1[14] = append2_t[0]; - block1[15] = append2_t[1]; - break; - - case 23: block1[ 7] |= append0_t[0]; - block1[ 8] = append0_t[1]; - block1[ 9] = append0_t[2]; - block1[10] = append0_t[3]; - block1[11] = append1_t[0]; - block1[12] = append1_t[1]; - block1[13] = append1_t[2]; - block1[14] = append1_t[3]; - block1[15] = append2_t[0]; - break; - - case 24: block1[ 8] |= append0_t[0]; - block1[ 9] = append0_t[1]; - block1[10] = append0_t[2]; - block1[11] = append0_t[3]; - block1[12] = append1_t[0]; - block1[13] = append1_t[1]; - block1[14] = append1_t[2]; - block1[15] = append1_t[3]; - break; - - case 25: block1[ 9] |= append0_t[0]; - block1[10] = append0_t[1]; - block1[11] = append0_t[2]; - block1[12] = append0_t[3]; - block1[13] = append1_t[0]; - block1[14] = append1_t[1]; - block1[15] = append1_t[2]; - break; - - case 26: block1[10] |= append0_t[0]; - block1[11] = append0_t[1]; - block1[12] = append0_t[2]; - block1[13] = append0_t[3]; - block1[14] = append1_t[0]; - block1[15] = append1_t[1]; - break; - - case 27: block1[11] |= append0_t[0]; - block1[12] = append0_t[1]; - block1[13] = append0_t[2]; - block1[14] = append0_t[3]; - block1[15] = append1_t[0]; - break; - - case 28: block1[12] |= append0_t[0]; - block1[13] = append0_t[1]; - block1[14] = append0_t[2]; - block1[15] = append0_t[3]; - break; - - case 29: block1[13] |= append0_t[0]; - block1[14] = append0_t[1]; - block1[15] = append0_t[2]; - break; - - case 30: block1[14] |= append0_t[0]; - block1[15] = append0_t[1]; - break; - } - - u32 
new_len = offset + append_len; - - return new_len; -} - -void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - 
salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = 
esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, 
w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const 
salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] 
= esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - 
esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, 
w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - 
MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, 
a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = 
esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = 
esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, 
b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], 
MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = 
esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - 
esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_M_SIMD (a, d, c, b); - } -} - -void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = 
esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = 
esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, 
w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const 
salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = 
esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] 
= esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, 
w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - /* - * final = md5 ($HA1 . 
$esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], 
MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - 
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - 
MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, 
a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = 
esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = 
esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] = esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, 
b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], 
MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t 
*salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - - /** - * digest - */ - - const u32 search[4] = - { - digests_buf[digests_offset].digest_buf[DGST_R0], - digests_buf[digests_offset].digest_buf[DGST_R1], - digests_buf[digests_offset].digest_buf[DGST_R2], - digests_buf[digests_offset].digest_buf[DGST_R3] - }; - - /** - * salt - */ - - const u32 salt_len = esalt_bufs[digests_offset].salt_len; // not a bug, we need to get it from the esalt - - const u32 pw_salt_len = salt_len + pw_len; - - u32 salt_buf0[16]; - u32 salt_buf1[16]; - - salt_buf0[ 0] = esalt_bufs[digests_offset].salt_buf[ 0]; - salt_buf0[ 1] = esalt_bufs[digests_offset].salt_buf[ 1]; - salt_buf0[ 2] = esalt_bufs[digests_offset].salt_buf[ 2]; - salt_buf0[ 3] = esalt_bufs[digests_offset].salt_buf[ 3]; - salt_buf0[ 4] = esalt_bufs[digests_offset].salt_buf[ 4]; - salt_buf0[ 5] = esalt_bufs[digests_offset].salt_buf[ 5]; - salt_buf0[ 6] = esalt_bufs[digests_offset].salt_buf[ 6]; - salt_buf0[ 7] = esalt_bufs[digests_offset].salt_buf[ 7]; - salt_buf0[ 8] = esalt_bufs[digests_offset].salt_buf[ 8]; - salt_buf0[ 9] = esalt_bufs[digests_offset].salt_buf[ 9]; - salt_buf0[10] = esalt_bufs[digests_offset].salt_buf[10]; - salt_buf0[11] = esalt_bufs[digests_offset].salt_buf[11]; - salt_buf0[12] = esalt_bufs[digests_offset].salt_buf[12]; - salt_buf0[13] = esalt_bufs[digests_offset].salt_buf[13]; - salt_buf0[14] = esalt_bufs[digests_offset].salt_buf[14]; - salt_buf0[15] = esalt_bufs[digests_offset].salt_buf[15]; - salt_buf1[ 0] = 
esalt_bufs[digests_offset].salt_buf[16]; - salt_buf1[ 1] = esalt_bufs[digests_offset].salt_buf[17]; - salt_buf1[ 2] = esalt_bufs[digests_offset].salt_buf[18]; - salt_buf1[ 3] = esalt_bufs[digests_offset].salt_buf[19]; - salt_buf1[ 4] = esalt_bufs[digests_offset].salt_buf[20]; - salt_buf1[ 5] = esalt_bufs[digests_offset].salt_buf[21]; - salt_buf1[ 6] = esalt_bufs[digests_offset].salt_buf[22]; - salt_buf1[ 7] = esalt_bufs[digests_offset].salt_buf[23]; - salt_buf1[ 8] = esalt_bufs[digests_offset].salt_buf[24]; - salt_buf1[ 9] = esalt_bufs[digests_offset].salt_buf[25]; - salt_buf1[10] = esalt_bufs[digests_offset].salt_buf[26]; - salt_buf1[11] = esalt_bufs[digests_offset].salt_buf[27]; - salt_buf1[12] = esalt_bufs[digests_offset].salt_buf[28]; - salt_buf1[13] = esalt_bufs[digests_offset].salt_buf[29]; - salt_buf1[14] = 0; - salt_buf1[15] = 0; - - /** - * esalt - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - - u32 esalt_buf0[16]; - u32 esalt_buf1[16]; - u32 esalt_buf2[16]; - - esalt_buf0[ 0] = esalt_bufs[digests_offset].esalt_buf[ 0]; - esalt_buf0[ 1] = esalt_bufs[digests_offset].esalt_buf[ 1]; - esalt_buf0[ 2] = esalt_bufs[digests_offset].esalt_buf[ 2]; - esalt_buf0[ 3] = esalt_bufs[digests_offset].esalt_buf[ 3]; - esalt_buf0[ 4] = esalt_bufs[digests_offset].esalt_buf[ 4]; - esalt_buf0[ 5] = esalt_bufs[digests_offset].esalt_buf[ 5]; - esalt_buf0[ 6] = esalt_bufs[digests_offset].esalt_buf[ 6]; - esalt_buf0[ 7] = esalt_bufs[digests_offset].esalt_buf[ 7]; - esalt_buf0[ 8] = esalt_bufs[digests_offset].esalt_buf[ 8]; - esalt_buf0[ 9] = esalt_bufs[digests_offset].esalt_buf[ 9]; - esalt_buf0[10] = esalt_bufs[digests_offset].esalt_buf[10]; - esalt_buf0[11] = esalt_bufs[digests_offset].esalt_buf[11]; - esalt_buf0[12] = esalt_bufs[digests_offset].esalt_buf[12]; - esalt_buf0[13] = esalt_bufs[digests_offset].esalt_buf[13]; - esalt_buf0[14] = esalt_bufs[digests_offset].esalt_buf[14]; - esalt_buf0[15] = esalt_bufs[digests_offset].esalt_buf[15]; - esalt_buf1[ 0] 
= esalt_bufs[digests_offset].esalt_buf[16]; - esalt_buf1[ 1] = esalt_bufs[digests_offset].esalt_buf[17]; - esalt_buf1[ 2] = esalt_bufs[digests_offset].esalt_buf[18]; - esalt_buf1[ 3] = esalt_bufs[digests_offset].esalt_buf[19]; - esalt_buf1[ 4] = esalt_bufs[digests_offset].esalt_buf[20]; - esalt_buf1[ 5] = esalt_bufs[digests_offset].esalt_buf[21]; - esalt_buf1[ 6] = esalt_bufs[digests_offset].esalt_buf[22]; - esalt_buf1[ 7] = esalt_bufs[digests_offset].esalt_buf[23]; - esalt_buf1[ 8] = esalt_bufs[digests_offset].esalt_buf[24]; - esalt_buf1[ 9] = esalt_bufs[digests_offset].esalt_buf[25]; - esalt_buf1[10] = esalt_bufs[digests_offset].esalt_buf[26]; - esalt_buf1[11] = esalt_bufs[digests_offset].esalt_buf[27]; - esalt_buf1[12] = esalt_bufs[digests_offset].esalt_buf[28]; - esalt_buf1[13] = esalt_bufs[digests_offset].esalt_buf[29]; - esalt_buf1[14] = esalt_bufs[digests_offset].esalt_buf[30]; - esalt_buf1[15] = esalt_bufs[digests_offset].esalt_buf[31]; - esalt_buf2[ 0] = esalt_bufs[digests_offset].esalt_buf[32]; - esalt_buf2[ 1] = esalt_bufs[digests_offset].esalt_buf[33]; - esalt_buf2[ 2] = esalt_bufs[digests_offset].esalt_buf[34]; - esalt_buf2[ 3] = esalt_bufs[digests_offset].esalt_buf[35]; - esalt_buf2[ 4] = esalt_bufs[digests_offset].esalt_buf[36]; - esalt_buf2[ 5] = esalt_bufs[digests_offset].esalt_buf[37]; - esalt_buf2[ 6] = 0; - esalt_buf2[ 7] = 0; - esalt_buf2[ 8] = 0; - esalt_buf2[ 9] = 0; - esalt_buf2[10] = 0; - esalt_buf2[11] = 0; - esalt_buf2[12] = 0; - esalt_buf2[13] = 0; - esalt_buf2[14] = 0; - esalt_buf2[15] = 0; - - const u32 digest_esalt_len = 32 + esalt_len; - - /** - * loop - */ - - u32 w0l = w0[0]; - - for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) - { - const u32x w0r = ix_create_bft (bfs_buf, il_pos); - - const u32x w0lr = w0l | w0r; - - /* - * HA1 = md5 ($salt . 
$pass) - */ - - // append the pass to the salt - - u32x block0[16]; - u32x block1[16]; - - block0[ 0] = salt_buf0[ 0]; - block0[ 1] = salt_buf0[ 1]; - block0[ 2] = salt_buf0[ 2]; - block0[ 3] = salt_buf0[ 3]; - block0[ 4] = salt_buf0[ 4]; - block0[ 5] = salt_buf0[ 5]; - block0[ 6] = salt_buf0[ 6]; - block0[ 7] = salt_buf0[ 7]; - block0[ 8] = salt_buf0[ 8]; - block0[ 9] = salt_buf0[ 9]; - block0[10] = salt_buf0[10]; - block0[11] = salt_buf0[11]; - block0[12] = salt_buf0[12]; - block0[13] = salt_buf0[13]; - block0[14] = salt_buf0[14]; - block0[15] = salt_buf0[15]; - block1[ 0] = salt_buf1[ 0]; - block1[ 1] = salt_buf1[ 1]; - block1[ 2] = salt_buf1[ 2]; - block1[ 3] = salt_buf1[ 3]; - block1[ 4] = salt_buf1[ 4]; - block1[ 5] = salt_buf1[ 5]; - block1[ 6] = salt_buf1[ 6]; - block1[ 7] = salt_buf1[ 7]; - block1[ 8] = salt_buf1[ 8]; - block1[ 9] = salt_buf1[ 9]; - block1[10] = salt_buf1[10]; - block1[11] = salt_buf1[11]; - block1[12] = salt_buf1[12]; - block1[13] = salt_buf1[13]; - block1[14] = salt_buf1[14]; - block1[15] = salt_buf1[15]; - - u32x w0_t[4]; - u32x w1_t[4]; - u32x w2_t[4]; - u32x w3_t[4]; - - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); - - w0_t[0] = block0[ 0]; - w0_t[1] = block0[ 1]; - w0_t[2] = block0[ 2]; - w0_t[3] = block0[ 3]; - w1_t[0] = block0[ 4]; - w1_t[1] = block0[ 5]; - w1_t[2] = block0[ 6]; - w1_t[3] = block0[ 7]; - w2_t[0] = block0[ 8]; - w2_t[1] = block0[ 9]; - w2_t[2] = block0[10]; - w2_t[3] = block0[11]; - w3_t[0] = block0[12]; - w3_t[1] = block0[13]; - w3_t[2] = block0[14]; - w3_t[3] = block0[15]; - - // md5 - - u32x a = MD5M_A; - u32x b = MD5M_B; - u32x c = MD5M_C; - u32x d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, 
d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, 
MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - u32x r_a = a; - u32x r_b = b; - u32x r_c = c; - u32x r_d = d; - - w0_t[0] = block1[ 0]; - w0_t[1] = block1[ 1]; - w0_t[2] = block1[ 2]; - w0_t[3] = block1[ 3]; - w1_t[0] = block1[ 4]; - w1_t[1] = block1[ 5]; - w1_t[2] = block1[ 6]; - 
w1_t[3] = block1[ 7]; - w2_t[0] = block1[ 8]; - w2_t[1] = block1[ 9]; - w2_t[2] = block1[10]; - w2_t[3] = block1[11]; - w3_t[0] = block1[12]; - w3_t[1] = block1[13]; - w3_t[2] = pw_salt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], 
MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - /* - * 
final = md5 ($HA1 . $esalt) - * we have at least 2 MD5 blocks/transformations, but we might need 3 - */ - - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - w2_t[0] = esalt_buf0[0]; - w2_t[1] = esalt_buf0[1]; - w2_t[2] = esalt_buf0[2]; - w2_t[3] = esalt_buf0[3]; - w3_t[0] = esalt_buf0[4]; - w3_t[1] = esalt_buf0[5]; - w3_t[2] = esalt_buf0[6]; - w3_t[3] = esalt_buf0[7]; - - // md5 - // 1st transform - - a = MD5M_A; - b = MD5M_B; - c = MD5M_C; - d = MD5M_D; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, 
b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, 
MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += MD5M_A; - b += MD5M_B; - c += MD5M_C; - d += MD5M_D; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - // 2nd transform - - w0_t[0] = esalt_buf0[ 8]; - w0_t[1] = esalt_buf0[ 9]; - w0_t[2] = esalt_buf0[10]; - w0_t[3] = esalt_buf0[11]; - w1_t[0] = esalt_buf0[12]; - w1_t[1] = esalt_buf0[13]; - w1_t[2] = esalt_buf0[14]; - w1_t[3] = esalt_buf0[15]; - w2_t[0] = esalt_buf1[ 0]; - w2_t[1] = esalt_buf1[ 1]; - w2_t[2] = esalt_buf1[ 2]; - w2_t[3] = esalt_buf1[ 3]; - w3_t[0] = esalt_buf1[ 4]; - w3_t[1] = esalt_buf1[ 5]; - w3_t[2] = esalt_buf1[ 6]; - w3_t[3] = esalt_buf1[ 7]; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP 
(MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, 
w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - // this is for sure the final block - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - r_a = a; - r_b = b; - r_c = c; - r_d = d; - - w0_t[0] = esalt_buf1[ 8]; - w0_t[1] = esalt_buf1[ 9]; - w0_t[2] = esalt_buf1[10]; - w0_t[3] = esalt_buf1[11]; - w1_t[0] = esalt_buf1[12]; - w1_t[1] = esalt_buf1[13]; - w1_t[2] = esalt_buf1[14]; - w1_t[3] = esalt_buf1[15]; - w2_t[0] = esalt_buf2[ 0]; - w2_t[1] = esalt_buf2[ 1]; - w2_t[2] = esalt_buf2[ 2]; - w2_t[3] = esalt_buf2[ 3]; - w3_t[0] = esalt_buf2[ 4]; - w3_t[1] = esalt_buf2[ 5]; - w3_t[2] = digest_esalt_len * 8; - w3_t[3] = 0; - - MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, 
MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - - MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP 
(MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - - MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - a += r_a; - b += r_b; - c += r_c; - d += r_d; - - COMPARE_S_SIMD (a, d, c, b); - } -} - -__kernel void m11400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400m_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400m_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400m_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, 
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400m_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = 
get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400m_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400m_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, 
d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400m_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400m_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - - u32 w3[4]; - - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400m_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400m_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400m_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400m_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? 
'0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400s_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400s_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400s_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, 
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400s_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = 
get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = pws[gid].i[14]; - w3[3] = 0; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400s_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400s_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, 
d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400s_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400s_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} - -__kernel void m11400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) -{ - /** - * modifier - */ - - const u32 gid = get_global_id (0); - const u32 lid = get_local_id (0); - const u32 lsz = get_local_size (0); - - /** - * bin2asc table - */ - - __local u32 l_bin2asc[256]; - - for (u32 i = lid; i < 256; i += lsz) - { - const u32 i0 = (i >> 0) & 15; - const u32 i1 = (i >> 4) & 15; - - l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 - | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; - } - - barrier (CLK_LOCAL_MEM_FENCE); - - if (gid >= gid_max) return; - - /** - * modifier - */ - - u32 w0[4]; - - w0[0] = pws[gid].i[ 0]; - w0[1] = pws[gid].i[ 1]; - w0[2] = pws[gid].i[ 2]; - w0[3] = pws[gid].i[ 3]; - - u32 w1[4]; - - w1[0] = pws[gid].i[ 4]; - w1[1] = pws[gid].i[ 5]; - w1[2] = pws[gid].i[ 6]; - w1[3] = pws[gid].i[ 7]; - - u32 w2[4]; - - w2[0] = pws[gid].i[ 8]; - w2[1] = pws[gid].i[ 9]; - w2[2] = pws[gid].i[10]; - w2[3] = pws[gid].i[11]; - - u32 w3[4]; - - w3[0] = pws[gid].i[12]; - w3[1] = pws[gid].i[13]; - w3[2] = pws[gid].i[14]; - w3[3] = pws[gid].i[15]; - - const u32 pw_len = pws[gid].pw_len; - - /** - * main - */ - - const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; - const u32 salt_len = esalt_bufs[digests_offset].salt_len; - - const u32 sw_1 = ((32 + esalt_len + 1) > 119); - const u32 sw_2 = ((pw_len + salt_len) > 55) << 1; - - switch (sw_1 | sw_2) - { - case 0: - m11400s_0_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 1: - m11400s_0_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 2: - m11400s_1_0 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - case 3: - m11400s_1_1 (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset, l_bin2asc); - break; - } -} diff --git a/OpenCL/m11400_a3.cl b/OpenCL/m11400_a3.cl new file mode 100644 index 000000000..ec568abd5 --- /dev/null +++ b/OpenCL/m11400_a3.cl @@ -0,0 +1,294 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include 
"inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_md5.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, 
const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; + + const u32 esalt_lenv = ceil ((float) esalt_len / 4); + + u32x esalt_buf[64] = { 0 }; + + for (int idx = 0; idx < esalt_lenv; idx++) + { + esalt_buf[idx] = esalt_bufs[digests_offset].esalt_buf[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx1; + + md5_init_vector_from_scalar (&ctx1, &ctx0); + + md5_update_vector (&ctx1, w, pw_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 
((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_vector (&ctx, esalt_buf, esalt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const sip_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * 
modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8 + | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; + + const u32 esalt_lenv = ceil ((float) esalt_len / 4); + + u32x esalt_buf[64] = { 0 }; + + for (int idx = 0; idx < esalt_lenv; idx++) + { + esalt_buf[idx] = esalt_bufs[digests_offset].esalt_buf[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + md5_ctx_t ctx0; + + md5_init (&ctx0); + + md5_update_global (&ctx0, esalt_bufs[digests_offset].salt_buf, esalt_bufs[digests_offset].salt_len); + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md5_ctx_vector_t ctx1; + + md5_init_vector_from_scalar (&ctx1, &ctx0); + + md5_update_vector (&ctx1, w, pw_len); + + md5_final_vector (&ctx1); + + const u32x a = ctx1.h[0]; + const u32x b = ctx1.h[1]; + const u32x c = ctx1.h[2]; + const u32x d = ctx1.h[3]; + + md5_ctx_vector_t ctx; + + md5_init_vector (&ctx); + + ctx.w0[0] = 
uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + + ctx.len = 32; + + md5_update_vector (&ctx, esalt_buf, esalt_len); + + md5_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} diff --git a/include/interface.h b/include/interface.h index 0783c5e7c..ed91ea899 100644 --- a/include/interface.h +++ b/include/interface.h @@ -211,10 +211,10 @@ typedef struct bitcoin_wallet typedef struct sip { - u32 salt_buf[30]; + u32 salt_buf[32]; u32 salt_len; - u32 esalt_buf[38]; + u32 esalt_buf[48]; u32 esalt_len; } sip_t; From 51128473bc0461880f3a123416a4067af0a2b3d5 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 7 Aug 2017 14:22:15 +0200 Subject: [PATCH 69/75] Add pure kernels for ColdFusion 10+ --- OpenCL/m12600_a0.cl | 332 ++++++++++++++++++++++++++++++++++++++++++ OpenCL/m12600_a1.cl | 307 +++++++++++++++++++++++++++++++++++++++ OpenCL/m12600_a3.cl | 341 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 980 insertions(+) create mode 100644 OpenCL/m12600_a0.cl create mode 100644 OpenCL/m12600_a1.cl create mode 100644 OpenCL/m12600_a3.cl diff --git 
a/OpenCL/m12600_a0.cl b/OpenCL/m12600_a0.cl new file mode 100644 index 000000000..79da6bd9d --- /dev/null +++ b/OpenCL/m12600_a0.cl @@ -0,0 +1,332 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" +#include "inc_hash_sha256.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t 
*digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = 
ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + + +__kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? 
'0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_swap (&ctx0, w, pw_len); + + sha1_final (&ctx0); + + const u32 a = ctx0.h[0]; + const u32 b = ctx0.h[1]; + const u32 c = ctx0.h[2]; + const u32 d = ctx0.h[3]; + const u32 e = ctx0.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b 
>> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m12600_a1.cl b/OpenCL/m12600_a1.cl new file mode 100644 index 000000000..d20b3ae90 --- /dev/null +++ b/OpenCL/m12600_a1.cl @@ -0,0 +1,307 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" +#include "inc_hash_sha256.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], 
l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = 
(i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + 
ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 
gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx1 = ctx0; + + sha1_update_global_swap (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx1); + + const u32 a = ctx1.h[0]; + const u32 b = ctx1.h[1]; + const u32 c = ctx1.h[2]; + const u32 d = ctx1.h[3]; + const u32 e = ctx1.h[4]; + + sha256_ctx_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a 
>> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m12600_a3.cl b/OpenCL/m12600_a3.cl new file mode 100644 index 000000000..be4ba4eb2 --- /dev/null +++ b/OpenCL/m12600_a3.cl @@ -0,0 +1,341 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" +#include 
"inc_hash_sha256.cl" + +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) +#endif + +__kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + 
/** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha256_ctx_vector_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 
16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final_vector (&ctx); + + ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 
*bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 gid = get_global_id (0); + const u32 lid = get_local_id (0); + const u32 lsz = get_local_size (0); + + /** + * bin2asc table + */ + + __local u32 l_bin2asc[256]; + + for (u32 i = lid; i < 256; i += lsz) + { + const u32 i0 = (i >> 0) & 15; + const u32 i1 = (i >> 4) & 15; + + l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'A' - 10 + i0) << 0 + | ((i1 < 10) ? '0' + i1 : 'A' - 10 + i1) << 8; + } + + barrier (CLK_LOCAL_MEM_FENCE); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + u32 pc256[8]; + + pc256[0] = salt_bufs[salt_pos].salt_buf_pc[0]; + pc256[1] = salt_bufs[salt_pos].salt_buf_pc[1]; + pc256[2] = salt_bufs[salt_pos].salt_buf_pc[2]; + pc256[3] = salt_bufs[salt_pos].salt_buf_pc[3]; + pc256[4] = salt_bufs[salt_pos].salt_buf_pc[4]; + pc256[5] = salt_bufs[salt_pos].salt_buf_pc[5]; + pc256[6] = salt_bufs[salt_pos].salt_buf_pc[6]; + pc256[7] = salt_bufs[salt_pos].salt_buf_pc[7]; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier 
(CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx0; + + sha1_init_vector (&ctx0); + + sha1_update_vector (&ctx0, w, pw_len); + + sha1_final_vector (&ctx0); + + const u32x a = ctx0.h[0]; + const u32x b = ctx0.h[1]; + const u32x c = ctx0.h[2]; + const u32x d = ctx0.h[3]; + const u32x e = ctx0.h[4]; + + sha256_ctx_vector_t ctx; + + ctx.h[0] = pc256[0]; + ctx.h[1] = pc256[1]; + ctx.h[2] = pc256[2]; + ctx.h[3] = pc256[3]; + ctx.h[4] = pc256[4]; + ctx.h[5] = pc256[5]; + ctx.h[6] = pc256[6]; + ctx.h[7] = pc256[7]; + + ctx.len = 64; + + ctx.w0[0] = uint_to_hex_lower8_le ((a >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 24) & 255) << 16; + ctx.w0[1] = uint_to_hex_lower8_le ((a >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 8) & 255) << 16; + ctx.w0[2] = uint_to_hex_lower8_le ((b >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 24) & 255) << 16; + ctx.w0[3] = uint_to_hex_lower8_le ((b >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 8) & 255) << 16; + ctx.w1[0] = uint_to_hex_lower8_le ((c >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 24) & 255) << 16; + ctx.w1[1] = uint_to_hex_lower8_le ((c >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 8) & 255) << 16; + ctx.w1[2] = uint_to_hex_lower8_le ((d >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 24) & 255) << 16; + ctx.w1[3] = uint_to_hex_lower8_le ((d >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 8) & 255) << 16; + ctx.w2[0] = uint_to_hex_lower8_le ((e >> 16) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 24) & 255) << 16; + ctx.w2[1] = uint_to_hex_lower8_le ((e >> 0) & 255) << 0 + | uint_to_hex_lower8_le ((e >> 8) & 255) << 16; + ctx.w2[2] = 0; + ctx.w2[3] = 0; + ctx.w3[0] = 0; + ctx.w3[1] = 0; + ctx.w3[2] = 0; + ctx.w3[3] = 0; + + ctx.len += 40; + + sha256_final_vector (&ctx); + + 
ctx.h[0] -= pc256[0]; + ctx.h[1] -= pc256[1]; + ctx.h[2] -= pc256[2]; + ctx.h[3] -= pc256[3]; + ctx.h[4] -= pc256[4]; + ctx.h[5] -= pc256[5]; + ctx.h[6] -= pc256[6]; + ctx.h[7] -= pc256[7]; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 5c6b3fa7ab5c8d2baf637514286ad9388e700f74 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 7 Aug 2017 15:22:18 +0200 Subject: [PATCH 70/75] Add pure kernels for Kerberos 5 TGS-REP etype 23 --- OpenCL/m13100_a0.cl | 516 +++++++++++++++++++++++++++++++++++++++++++ OpenCL/m13100_a1.cl | 492 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m13100_a3.cl | 526 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1534 insertions(+) create mode 100644 OpenCL/m13100_a0.cl create mode 100644 OpenCL/m13100_a1.cl create mode 100644 OpenCL/m13100_a3.cl diff --git a/OpenCL/m13100_a0.cl b/OpenCL/m13100_a0.cl new file mode 100644 index 000000000..d01619666 --- /dev/null +++ b/OpenCL/m13100_a0.cl @@ -0,0 +1,516 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i 
< 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int 
decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], __global const u32 *edata2, const u32 edata2_len, const u32 K2[4], const u32 checksum[4]) +{ + rc4_init_16 (rc4_key, data); + + u32 out0[4]; + u32 out1[4]; + + u8 i = 0; + u8 j = 0; + + /* + 8 first bytes are nonce, then ASN1 structs (DER encoding: type-length-data) + + if length >= 128 bytes: + length is on 2 bytes and type is \x63\x82 (encode_krb5_enc_tkt_part) and data is an ASN1 sequence \x30\x82 + else: + length is on 1 byte and type is \x63\x81 and data is an ASN1 sequence \x30\x81 + + next headers follow the same ASN1 "type-length-data" scheme + */ + + j = rc4_next_16 (rc4_key, i, j, edata2 + 0, out0); i += 16; + + if (((out0[2] & 0xff00ffff) != 0x30008163) && ((out0[2] & 0x0000ffff) != 0x00008263)) return 0; + + j = rc4_next_16 (rc4_key, i, j, edata2 + 4, out1); i += 16; + + if (((out1[0] & 0x00ffffff) != 0x00000503) && (out1[0] != 0x050307A0)) return 0; + + rc4_init_16 (rc4_key, data); + + i = 0; + j = 0; + + // init hmac + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K2[0]; + w0[1] = K2[1]; + w0[2] = K2[2]; + w0[3] = K2[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + int edata2_left; + + for (edata2_left = edata2_len; edata2_left >= 64; edata2_left -= 64) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } + + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + if 
(edata2_left < 16) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + + truncate_block_4x4_le (w0, edata2_left & 0xf); + } + else if (edata2_left < 32) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + + truncate_block_4x4_le (w1, edata2_left & 0xf); + } + else if (edata2_left < 48) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + + truncate_block_4x4_le (w2, edata2_left & 0xf); + } + else + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + truncate_block_4x4_le (w3, edata2_left & 0xf); + } + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, edata2_left); + + md5_hmac_final (&ctx); + + if (checksum[0] != ctx.opad.h[0]) return 0; + if (checksum[1] != ctx.opad.h[1]) return 0; + if (checksum[2] != ctx.opad.h[2]) return 0; + if (checksum[3] != ctx.opad.h[3]) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4], u32 K2[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 2; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + 
w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; + + K2[0] = ctx1.opad.h[0]; + K2[1] = ctx1.opad.h[1]; + K2[2] = ctx1.opad.h[2]; + K2[3] = ctx1.opad.h[3]; +} + +__kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const 
u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const 
krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m13100_a1.cl b/OpenCL/m13100_a1.cl new file mode 100644 index 000000000..38b1ed643 --- /dev/null +++ b/OpenCL/m13100_a1.cl @@ -0,0 +1,492 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + 
+#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global 
const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], __global const u32 *edata2, const u32 edata2_len, const u32 K2[4], const u32 checksum[4]) +{ + rc4_init_16 (rc4_key, data); + + u32 out0[4]; + u32 out1[4]; + + u8 i = 0; + u8 j = 0; + + /* + 8 first bytes are nonce, then ASN1 structs (DER encoding: type-length-data) + + if length >= 128 bytes: + length is on 2 bytes and type is \x63\x82 (encode_krb5_enc_tkt_part) and data is an ASN1 sequence \x30\x82 + else: + length is on 1 byte and type is \x63\x81 and data is an ASN1 sequence \x30\x81 + + next headers follow the same ASN1 "type-length-data" scheme + */ + + j = rc4_next_16 (rc4_key, i, j, edata2 + 0, out0); i += 16; + + if (((out0[2] & 0xff00ffff) != 0x30008163) && ((out0[2] & 0x0000ffff) != 0x00008263)) return 0; + + j = rc4_next_16 (rc4_key, i, j, edata2 + 4, out1); i += 16; + + if (((out1[0] & 0x00ffffff) != 0x00000503) && (out1[0] != 0x050307A0)) return 0; + + rc4_init_16 (rc4_key, data); + + i = 0; + j = 0; + + // init hmac + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K2[0]; + w0[1] = K2[1]; + w0[2] = K2[2]; + w0[3] = K2[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] 
= 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + int edata2_left; + + for (edata2_left = edata2_len; edata2_left >= 64; edata2_left -= 64) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } + + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + if (edata2_left < 16) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + + truncate_block_4x4_le (w0, edata2_left & 0xf); + } + else if (edata2_left < 32) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + + truncate_block_4x4_le (w1, edata2_left & 0xf); + } + else if (edata2_left < 48) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + + truncate_block_4x4_le (w2, edata2_left & 0xf); + } + else + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + truncate_block_4x4_le (w3, edata2_left & 0xf); + } + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, edata2_left); + + md5_hmac_final (&ctx); + + if (checksum[0] != ctx.opad.h[0]) return 0; + if (checksum[1] != ctx.opad.h[1]) return 0; + if (checksum[2] != ctx.opad.h[2]) return 0; + if (checksum[3] != 
ctx.opad.h[3]) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4], u32 K2[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 2; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; + + K2[0] = ctx1.opad.h[0]; + K2[1] = ctx1.opad.h[1]; + K2[2] = ctx1.opad.h[2]; + K2[3] = ctx1.opad.h[3]; +} + +__kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void 
m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + md4_ctx_t ctx0; + + md4_init (&ctx0); + + md4_update_global_utf16le (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + md4_ctx_t ctx = ctx0; + + md4_update_global_utf16le (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, 
krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} diff --git a/OpenCL/m13100_a3.cl b/OpenCL/m13100_a3.cl new file mode 100644 index 000000000..7b1ac5a05 --- /dev/null +++ b/OpenCL/m13100_a3.cl @@ -0,0 +1,526 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//shared mem too small +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_hash_md4.cl" +#include "inc_hash_md5.cl" + +typedef struct +{ + u8 S[256]; + + u32 wtf_its_faster; + +} RC4_KEY; + +void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) +{ + u8 tmp; + + tmp = rc4_key->S[i]; + rc4_key->S[i] = rc4_key->S[j]; + rc4_key->S[j] = tmp; +} + +void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +{ + u32 v = 0x03020100; + u32 a = 0x04040404; + + __local u32 *ptr = (__local u32 *) rc4_key->S; + + #ifdef _unroll + #pragma unroll + #endif + for (u32 i = 0; i < 64; i++) + { + *ptr++ = v; v += a; + } + + u32 j = 0; + + for (u32 i = 0; i < 16; i++) + { + u32 idx = i * 16; + + u32 v; + + v = data[0]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[1]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[2]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); 
swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + + v = data[3]; + + j += rc4_key->S[idx] + (v >> 0); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 8); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 16); swap (rc4_key, idx, j); idx++; + j += rc4_key->S[idx] + (v >> 24); swap (rc4_key, idx, j); idx++; + } +} + +u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __global const u32 in[4], u32 out[4]) +{ + #ifdef _unroll + #pragma unroll + #endif + for (u32 k = 0; k < 4; k++) + { + u32 xor4 = 0; + + u8 idx; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 0; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 8; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 16; + + i += 1; + j += rc4_key->S[i]; + + swap (rc4_key, i, j); + + idx = rc4_key->S[i] + rc4_key->S[j]; + + xor4 |= rc4_key->S[idx] << 24; + + out[k] = in[k] ^ xor4; + } + + return j; +} + +int decrypt_and_check (__local RC4_KEY *rc4_key, u32 data[4], __global const u32 *edata2, const u32 edata2_len, const u32 K2[4], const u32 checksum[4]) +{ + rc4_init_16 (rc4_key, data); + + u32 out0[4]; + u32 out1[4]; + + u8 i = 0; + u8 j = 0; + + /* + 8 first bytes are nonce, then ASN1 structs (DER encoding: type-length-data) + + if length >= 128 bytes: + length is on 2 bytes and type is \x63\x82 (encode_krb5_enc_tkt_part) and data is an ASN1 sequence \x30\x82 + else: + length is on 1 byte and type is \x63\x81 and data is an ASN1 sequence \x30\x81 + + next headers follow the same ASN1 "type-length-data" scheme + */ + + j = rc4_next_16 (rc4_key, i, j, edata2 + 0, out0); i += 16; + + if (((out0[2] & 0xff00ffff) != 0x30008163) && 
((out0[2] & 0x0000ffff) != 0x00008263)) return 0; + + j = rc4_next_16 (rc4_key, i, j, edata2 + 4, out1); i += 16; + + if (((out1[0] & 0x00ffffff) != 0x00000503) && (out1[0] != 0x050307A0)) return 0; + + rc4_init_16 (rc4_key, data); + + i = 0; + j = 0; + + // init hmac + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K2[0]; + w0[1] = K2[1]; + w0[2] = K2[2]; + w0[3] = K2[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + int edata2_left; + + for (edata2_left = edata2_len; edata2_left >= 64; edata2_left -= 64) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } + + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + if (edata2_left < 16) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + + truncate_block_4x4_le (w0, edata2_left & 0xf); + } + else if (edata2_left < 32) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + + truncate_block_4x4_le (w1, edata2_left & 0xf); + } + else if (edata2_left < 48) + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + + truncate_block_4x4_le (w2, edata2_left & 0xf); + } + else + { + j = rc4_next_16 (rc4_key, i, j, edata2, w0); i += 16; 
edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w1); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w2); i += 16; edata2 += 4; + j = rc4_next_16 (rc4_key, i, j, edata2, w3); i += 16; edata2 += 4; + + truncate_block_4x4_le (w3, edata2_left & 0xf); + } + + md5_hmac_update_64 (&ctx, w0, w1, w2, w3, edata2_left); + + md5_hmac_final (&ctx); + + if (checksum[0] != ctx.opad.h[0]) return 0; + if (checksum[1] != ctx.opad.h[1]) return 0; + if (checksum[2] != ctx.opad.h[2]) return 0; + if (checksum[3] != ctx.opad.h[3]) return 0; + + return 1; +} + +void kerb_prepare (const u32 K[4], const u32 checksum[4], u32 digest[4], u32 K2[4]) +{ + // K1=MD5_HMAC(K,1); with 1 encoded as little indian on 4 bytes (01000000 in hexa); + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = K[0]; + w0[1] = K[1]; + w0[2] = K[2]; + w0[3] = K[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx1; + + md5_hmac_init_64 (&ctx1, w0, w1, w2, w3); + + w0[0] = 2; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx1, w0, w1, w2, w3, 4); + + md5_hmac_final (&ctx1); + + w0[0] = ctx1.opad.h[0]; + w0[1] = ctx1.opad.h[1]; + w0[2] = ctx1.opad.h[2]; + w0[3] = ctx1.opad.h[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_ctx_t ctx; + + md5_hmac_init_64 (&ctx, w0, w1, w2, w3); + + w0[0] = checksum[0]; + w0[1] = checksum[1]; + w0[2] = checksum[2]; + w0[3] = checksum[3]; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + md5_hmac_update_64 (&ctx, w0, w1, w2, 
w3, 16); + + md5_hmac_final (&ctx); + + digest[0] = ctx.opad.h[0]; + digest[1] = ctx.opad.h[1]; + digest[2] = ctx.opad.h[2]; + digest[3] = ctx.opad.h[3]; + + K2[0] = ctx1.opad.h[0]; + K2[1] = ctx1.opad.h[1]; + K2[2] = ctx1.opad.h[2]; + K2[3] = ctx1.opad.h[3]; +} + +__kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = 
krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} + +__kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const krb5tgs_t *krb5tgs_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 
pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + __local RC4_KEY rc4_keys[64]; + + u32 checksum[4]; + + checksum[0] = krb5tgs_bufs[digests_offset].checksum[0]; + checksum[1] = krb5tgs_bufs[digests_offset].checksum[1]; + checksum[2] = krb5tgs_bufs[digests_offset].checksum[2]; + checksum[3] = krb5tgs_bufs[digests_offset].checksum[3]; + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + md4_ctx_t ctx; + + md4_init (&ctx); + + md4_update_utf16le (&ctx, w, pw_len); + + md4_final (&ctx); + + u32 digest[4]; + + u32 K2[4]; + + kerb_prepare (ctx.h, checksum, digest, K2); + + if (decrypt_and_check (&rc4_keys[lid], digest, krb5tgs_bufs[digests_offset].edata2, krb5tgs_bufs[digests_offset].edata2_len, K2, checksum) == 1) + { + if (atomic_inc (&hashes_shown[digests_offset]) == 0) + { + mark_hash (plains_buf, d_return_buf, salt_pos, digests_cnt, 0, digests_offset + 0, gid, il_pos); + } + } + } +} From b9876c100bab66a2c29aa52c0a7e9e308b85eaad Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 7 Aug 2017 15:28:23 +0200 Subject: [PATCH 71/75] Add pure kernels for AxCrypt in-memory SHA1 --- OpenCL/m13300_a0.cl | 138 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m13300_a1.cl | 114 ++++++++++++++++++++++++++++++++++ OpenCL/m13300_a3.cl | 148 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 400 insertions(+) create mode 100644 OpenCL/m13300_a0.cl create mode 100644 OpenCL/m13300_a1.cl create mode 100644 OpenCL/m13300_a3.cl diff --git a/OpenCL/m13300_a0.cl b/OpenCL/m13300_a0.cl new file mode 100644 index 000000000..0c3a005b0 --- /dev/null +++ b/OpenCL/m13300_a0.cl @@ -0,0 +1,138 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + 
*/ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + sha1_update_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = 
ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx; + + sha1_init (&ctx); + + sha1_update_swap 
(&ctx, w, pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13300_a1.cl b/OpenCL/m13300_a1.cl new file mode 100644 index 000000000..c17774d88 --- /dev/null +++ b/OpenCL/m13300_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, 
pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha1_ctx_t ctx0; + + sha1_init (&ctx0); + + sha1_update_global_swap (&ctx0, pws[gid].i, 
pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + ctx.h[4] = 0; + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13300_a3.cl b/OpenCL/m13300_a3.cl new file mode 100644 index 000000000..8b69186a2 --- /dev/null +++ b/OpenCL/m13300_a3.cl @@ -0,0 +1,148 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id 
(0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + sha1_update_vector (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + ctx.h[4] = 0; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 
gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + + barrier (CLK_GLOBAL_MEM_FENCE); + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector (&ctx); + + sha1_update_vector (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + ctx.h[4] = 0; + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 0a676b549f1bdb5ae3a2d2d1d8e2bd36a9817451 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 7 Aug 2017 17:25:15 +0200 Subject: [PATCH 72/75] Remove global barrier when not needed to workaround Intel OpenCL runtime bug --- OpenCL/m00000_a0.cl | 4 ---- OpenCL/m00000_a3.cl | 4 ---- OpenCL/m00010_a0.cl | 8 -------- OpenCL/m00010_a1.cl | 4 ---- OpenCL/m00010_a3.cl | 8 -------- OpenCL/m00020_a0.cl | 4 ---- OpenCL/m00020_a3.cl | 4 ---- OpenCL/m00030_a0.cl | 8 -------- OpenCL/m00030_a1.cl | 4 ---- OpenCL/m00030_a3.cl | 8 -------- OpenCL/m00040_a0.cl | 4 ---- OpenCL/m00040_a3.cl | 4 ---- OpenCL/m00050_a0.cl | 8 -------- OpenCL/m00050_a1.cl | 8 -------- OpenCL/m00050_a3.cl | 8 -------- OpenCL/m00060_a0.cl | 8 -------- OpenCL/m00060_a1.cl | 8 -------- OpenCL/m00060_a3.cl | 8 -------- OpenCL/m00100_a0.cl | 4 ---- OpenCL/m00100_a3.cl | 4 ---- OpenCL/m00110_a0.cl | 8 -------- 
OpenCL/m00110_a1.cl | 4 ---- OpenCL/m00110_a3.cl | 8 -------- OpenCL/m00120_a0.cl | 4 ---- OpenCL/m00120_a3.cl | 4 ---- OpenCL/m00130_a0.cl | 8 -------- OpenCL/m00130_a1.cl | 4 ---- OpenCL/m00130_a3.cl | 8 -------- OpenCL/m00140_a0.cl | 4 ---- OpenCL/m00140_a3.cl | 4 ---- OpenCL/m00150_a0.cl | 8 -------- OpenCL/m00150_a1.cl | 8 -------- OpenCL/m00150_a3.cl | 8 -------- OpenCL/m00160_a0.cl | 8 -------- OpenCL/m00160_a1.cl | 8 -------- OpenCL/m00160_a3.cl | 8 -------- OpenCL/m00300_a0.cl | 4 ---- OpenCL/m00300_a3.cl | 4 ---- OpenCL/m00400.cl | 2 -- OpenCL/m00500.cl | 8 -------- OpenCL/m00900_a0.cl | 4 ---- OpenCL/m00900_a3.cl | 4 ---- OpenCL/m01000_a0.cl | 4 ---- OpenCL/m01000_a3.cl | 4 ---- OpenCL/m01100_a0.cl | 8 -------- OpenCL/m01100_a1.cl | 4 ---- OpenCL/m01100_a3.cl | 8 -------- OpenCL/m01300_a0.cl | 4 ---- OpenCL/m01300_a3.cl | 4 ---- OpenCL/m01400_a0.cl | 4 ---- OpenCL/m01400_a3.cl | 4 ---- OpenCL/m01410_a0.cl | 8 -------- OpenCL/m01410_a1.cl | 4 ---- OpenCL/m01410_a3.cl | 8 -------- OpenCL/m01420_a0.cl | 4 ---- OpenCL/m01420_a3.cl | 4 ---- OpenCL/m01430_a0.cl | 8 -------- OpenCL/m01430_a1.cl | 4 ---- OpenCL/m01430_a3.cl | 8 -------- OpenCL/m01440_a0.cl | 4 ---- OpenCL/m01440_a3.cl | 4 ---- OpenCL/m01450_a0.cl | 8 -------- OpenCL/m01450_a1.cl | 8 -------- OpenCL/m01450_a3.cl | 8 -------- OpenCL/m01460_a0.cl | 8 -------- OpenCL/m01460_a1.cl | 8 -------- OpenCL/m01460_a3.cl | 8 -------- OpenCL/m01600.cl | 8 -------- OpenCL/m01700_a0.cl | 4 ---- OpenCL/m01700_a3.cl | 4 ---- OpenCL/m01710_a0.cl | 8 -------- OpenCL/m01710_a1.cl | 4 ---- OpenCL/m01710_a3.cl | 8 -------- OpenCL/m01720_a0.cl | 4 ---- OpenCL/m01720_a3.cl | 4 ---- OpenCL/m01730_a0.cl | 8 -------- OpenCL/m01730_a1.cl | 4 ---- OpenCL/m01730_a3.cl | 8 -------- OpenCL/m01740_a0.cl | 4 ---- OpenCL/m01740_a3.cl | 4 ---- OpenCL/m01750_a0.cl | 8 -------- OpenCL/m01750_a1.cl | 8 -------- OpenCL/m01750_a3.cl | 8 -------- OpenCL/m01760_a0.cl | 8 -------- OpenCL/m01760_a1.cl | 8 -------- OpenCL/m01760_a3.cl | 8 
-------- OpenCL/m01800.cl | 4 ---- OpenCL/m02610_a0.cl | 8 -------- OpenCL/m02610_a1.cl | 4 ---- OpenCL/m02610_a3.cl | 8 -------- OpenCL/m02810_a0.cl | 8 -------- OpenCL/m02810_a1.cl | 4 ---- OpenCL/m02810_a3.cl | 8 -------- OpenCL/m03710_a0.cl | 8 -------- OpenCL/m03710_a1.cl | 4 ---- OpenCL/m03710_a3.cl | 8 -------- OpenCL/m03800_a0.cl | 8 -------- OpenCL/m03800_a1.cl | 4 ---- OpenCL/m03800_a3.cl | 8 -------- OpenCL/m03910_a0.cl | 8 -------- OpenCL/m03910_a1.cl | 4 ---- OpenCL/m03910_a3.cl | 8 -------- OpenCL/m04010_a0.cl | 4 ---- OpenCL/m04010_a3.cl | 4 ---- OpenCL/m04110_a0.cl | 8 -------- OpenCL/m04110_a1.cl | 4 ---- OpenCL/m04110_a3.cl | 8 -------- OpenCL/m04310_a0.cl | 8 -------- OpenCL/m04310_a1.cl | 4 ---- OpenCL/m04310_a3.cl | 8 -------- OpenCL/m04400_a0.cl | 4 ---- OpenCL/m04400_a3.cl | 4 ---- OpenCL/m04500_a0.cl | 4 ---- OpenCL/m04500_a3.cl | 4 ---- OpenCL/m04520_a0.cl | 4 ---- OpenCL/m04520_a3.cl | 4 ---- OpenCL/m04700_a0.cl | 4 ---- OpenCL/m04700_a3.cl | 4 ---- OpenCL/m04800_a0.cl | 4 ---- OpenCL/m04800_a3.cl | 4 ---- OpenCL/m04900_a0.cl | 8 -------- OpenCL/m04900_a1.cl | 4 ---- OpenCL/m04900_a3.cl | 8 -------- OpenCL/m05100_a0.cl | 4 ---- OpenCL/m05100_a3.cl | 4 ---- OpenCL/m05300_a0.cl | 4 ---- OpenCL/m05300_a1.cl | 4 ---- OpenCL/m05300_a3.cl | 4 ---- OpenCL/m05400_a0.cl | 4 ---- OpenCL/m05400_a1.cl | 4 ---- OpenCL/m05400_a3.cl | 4 ---- OpenCL/m05500_a0.cl | 4 ---- OpenCL/m05500_a3.cl | 4 ---- OpenCL/m05600_a0.cl | 4 ---- OpenCL/m05600_a3.cl | 4 ---- OpenCL/m05800.cl | 4 ---- OpenCL/m06000_a0.cl | 4 ---- OpenCL/m06000_a3.cl | 4 ---- OpenCL/m06100_a0.cl | 4 ---- OpenCL/m06100_a3.cl | 4 ---- OpenCL/m06300.cl | 8 -------- OpenCL/m07000_a0.cl | 4 ---- OpenCL/m07000_a3.cl | 4 ---- OpenCL/m07300_a0.cl | 4 ---- OpenCL/m07300_a1.cl | 4 ---- OpenCL/m07300_a3.cl | 4 ---- OpenCL/m07400.cl | 4 ---- OpenCL/m07500_a0.cl | 4 ---- OpenCL/m07500_a3.cl | 4 ---- OpenCL/m07900.cl | 2 -- OpenCL/m08100_a0.cl | 4 ---- OpenCL/m08100_a3.cl | 4 ---- OpenCL/m08300_a0.cl | 12 
------------ OpenCL/m08300_a1.cl | 8 -------- OpenCL/m08300_a3.cl | 12 ------------ OpenCL/m08400_a0.cl | 4 ---- OpenCL/m08400_a3.cl | 4 ---- OpenCL/m08900.cl | 4 ---- OpenCL/m09900_a0.cl | 4 ---- OpenCL/m09900_a3.cl | 4 ---- OpenCL/m10700.cl | 2 -- OpenCL/m10800_a0.cl | 4 ---- OpenCL/m10800_a3.cl | 4 ---- OpenCL/m11000_a0.cl | 4 ---- OpenCL/m11000_a3.cl | 4 ---- OpenCL/m11100_a0.cl | 4 ---- OpenCL/m11100_a3.cl | 4 ---- OpenCL/m11200_a0.cl | 4 ---- OpenCL/m11200_a3.cl | 4 ---- OpenCL/m11400_a0.cl | 4 ---- OpenCL/m11400_a3.cl | 8 -------- OpenCL/m11600.cl | 2 -- OpenCL/m12400.cl | 2 -- OpenCL/m12600_a0.cl | 4 ---- OpenCL/m12600_a3.cl | 4 ---- OpenCL/m13100_a0.cl | 4 ---- OpenCL/m13100_a3.cl | 4 ---- OpenCL/m13300_a0.cl | 4 ---- OpenCL/m13300_a3.cl | 4 ---- OpenCL/m15700.cl | 4 ---- 180 files changed, 978 deletions(-) diff --git a/OpenCL/m00000_a0.cl b/OpenCL/m00000_a0.cl index a30464522..82690362f 100644 --- a/OpenCL/m00000_a0.cl +++ b/OpenCL/m00000_a0.cl @@ -39,8 +39,6 @@ __kernel void m00000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m00000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00000_a3.cl b/OpenCL/m00000_a3.cl index 175c759fd..cb172a09d 100644 --- a/OpenCL/m00000_a3.cl +++ b/OpenCL/m00000_a3.cl @@ -37,8 +37,6 @@ __kernel void m00000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m00000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00010_a0.cl 
b/OpenCL/m00010_a0.cl index f353def68..76e002021 100644 --- a/OpenCL/m00010_a0.cl +++ b/OpenCL/m00010_a0.cl @@ -39,8 +39,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00010_a1.cl b/OpenCL/m00010_a1.cl index 7a18b79d6..b970e65e5 100644 --- a/OpenCL/m00010_a1.cl +++ b/OpenCL/m00010_a1.cl @@ -37,8 +37,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00010_a3.cl b/OpenCL/m00010_a3.cl index 304bad19d..02bbb9768 100644 --- a/OpenCL/m00010_a3.cl +++ b/OpenCL/m00010_a3.cl @@ -37,8 +37,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00020_a0.cl b/OpenCL/m00020_a0.cl index ec7f7187d..4cd3e5e84 100644 --- a/OpenCL/m00020_a0.cl +++ b/OpenCL/m00020_a0.cl @@ -39,8 +39,6 @@ __kernel void m00020_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00020_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00020_a3.cl b/OpenCL/m00020_a3.cl index fb46e8374..c22d90731 100644 --- a/OpenCL/m00020_a3.cl +++ b/OpenCL/m00020_a3.cl @@ -37,8 +37,6 @@ __kernel void m00020_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00020_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff 
--git a/OpenCL/m00030_a0.cl b/OpenCL/m00030_a0.cl index 81fe98a62..349802d62 100644 --- a/OpenCL/m00030_a0.cl +++ b/OpenCL/m00030_a0.cl @@ -39,8 +39,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00030_a1.cl b/OpenCL/m00030_a1.cl index 2160c48e9..babf8745d 100644 --- a/OpenCL/m00030_a1.cl +++ b/OpenCL/m00030_a1.cl @@ -37,8 +37,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00030_a3.cl b/OpenCL/m00030_a3.cl index 10c2b0c00..d750b40cb 100644 --- a/OpenCL/m00030_a3.cl +++ b/OpenCL/m00030_a3.cl @@ -37,8 +37,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < 
pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00030_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00030_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00040_a0.cl b/OpenCL/m00040_a0.cl index bf4aa3ff9..af4b80719 100644 --- a/OpenCL/m00040_a0.cl +++ b/OpenCL/m00040_a0.cl @@ -39,8 +39,6 @@ __kernel void m00040_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00040_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00040_a3.cl b/OpenCL/m00040_a3.cl index be5995613..af341b374 100644 --- a/OpenCL/m00040_a3.cl +++ b/OpenCL/m00040_a3.cl @@ -37,8 +37,6 @@ __kernel void m00040_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00040_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier 
(CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m00050_a0.cl b/OpenCL/m00050_a0.cl index 263ef488f..51e5e14f6 100644 --- a/OpenCL/m00050_a0.cl +++ b/OpenCL/m00050_a0.cl @@ -39,8 +39,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00050_a1.cl b/OpenCL/m00050_a1.cl index de0739fb6..702ebebbe 100644 --- a/OpenCL/m00050_a1.cl +++ b/OpenCL/m00050_a1.cl @@ -37,8 +37,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -135,8 +131,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 
salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00050_a3.cl b/OpenCL/m00050_a3.cl index 09f223a60..209ecdb74 100644 --- a/OpenCL/m00050_a3.cl +++ b/OpenCL/m00050_a3.cl @@ -37,8 +37,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00050_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m00050_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00060_a0.cl b/OpenCL/m00060_a0.cl index ea4b96827..75369b728 100644 --- a/OpenCL/m00060_a0.cl +++ b/OpenCL/m00060_a0.cl @@ -39,8 +39,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier 
(CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; diff --git a/OpenCL/m00060_a1.cl b/OpenCL/m00060_a1.cl index a0e4f0a23..6cff7c337 100644 --- a/OpenCL/m00060_a1.cl +++ b/OpenCL/m00060_a1.cl @@ -37,8 +37,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_t ctx0; diff --git a/OpenCL/m00060_a3.cl b/OpenCL/m00060_a3.cl index 1c1d79a29..c1f165249 100644 --- a/OpenCL/m00060_a3.cl +++ b/OpenCL/m00060_a3.cl @@ -37,8 +37,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00060_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00060_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m00100_a0.cl b/OpenCL/m00100_a0.cl index a8bd7ec57..c14eaef04 100644 --- a/OpenCL/m00100_a0.cl +++ b/OpenCL/m00100_a0.cl @@ -39,8 +39,6 @@ __kernel void m00100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m00100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00100_a3.cl b/OpenCL/m00100_a3.cl index 50ecd137b..f125ff993 100644 --- a/OpenCL/m00100_a3.cl +++ b/OpenCL/m00100_a3.cl @@ -37,8 +37,6 @@ __kernel void m00100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m00100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff 
--git a/OpenCL/m00110_a0.cl b/OpenCL/m00110_a0.cl index da9c38d03..030b0ca2e 100644 --- a/OpenCL/m00110_a0.cl +++ b/OpenCL/m00110_a0.cl @@ -39,8 +39,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00110_a1.cl b/OpenCL/m00110_a1.cl index 3d8987f72..e9a349591 100644 --- a/OpenCL/m00110_a1.cl +++ b/OpenCL/m00110_a1.cl @@ -37,8 +37,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00110_a3.cl b/OpenCL/m00110_a3.cl index 00418eafc..6320923dd 100644 --- a/OpenCL/m00110_a3.cl +++ b/OpenCL/m00110_a3.cl @@ -37,8 +37,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global 
const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00120_a0.cl b/OpenCL/m00120_a0.cl index 6849a85a8..78dca9c65 100644 --- a/OpenCL/m00120_a0.cl +++ b/OpenCL/m00120_a0.cl @@ -39,8 +39,6 @@ __kernel void m00120_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00120_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00120_a3.cl b/OpenCL/m00120_a3.cl index 178dd369e..04ee961f2 100644 --- a/OpenCL/m00120_a3.cl +++ b/OpenCL/m00120_a3.cl @@ -37,8 +37,6 @@ __kernel void m00120_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m00120_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < 
pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00130_a0.cl b/OpenCL/m00130_a0.cl index ae3d7372c..859b3c2c1 100644 --- a/OpenCL/m00130_a0.cl +++ b/OpenCL/m00130_a0.cl @@ -39,8 +39,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00130_a1.cl b/OpenCL/m00130_a1.cl index 2f7f11bd4..4d6b8e020 100644 --- a/OpenCL/m00130_a1.cl +++ b/OpenCL/m00130_a1.cl @@ -37,8 +37,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00130_a3.cl b/OpenCL/m00130_a3.cl index 8024f2233..b1cf6ddcc 100644 --- a/OpenCL/m00130_a3.cl 
+++ b/OpenCL/m00130_a3.cl @@ -37,8 +37,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00130_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,8 +120,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -137,8 +131,6 @@ __kernel void m00130_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00140_a0.cl b/OpenCL/m00140_a0.cl index 1b8c5e717..aa9c8b04b 100644 --- a/OpenCL/m00140_a0.cl +++ b/OpenCL/m00140_a0.cl @@ -39,8 +39,6 @@ __kernel void m00140_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m00140_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00140_a3.cl b/OpenCL/m00140_a3.cl index 75d22495b..91ad701ac 100644 --- a/OpenCL/m00140_a3.cl +++ b/OpenCL/m00140_a3.cl @@ -37,8 +37,6 @@ __kernel void m00140_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -114,8 +112,6 @@ 
__kernel void m00140_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m00150_a0.cl b/OpenCL/m00150_a0.cl index 957d2a50b..3e4d11f88 100644 --- a/OpenCL/m00150_a0.cl +++ b/OpenCL/m00150_a0.cl @@ -39,8 +39,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00150_a1.cl b/OpenCL/m00150_a1.cl index 96153a25c..9b55986b6 100644 --- a/OpenCL/m00150_a1.cl +++ b/OpenCL/m00150_a1.cl @@ -37,8 +37,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -135,8 
+131,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00150_a3.cl b/OpenCL/m00150_a3.cl index 27c7dc7dd..195218abf 100644 --- a/OpenCL/m00150_a3.cl +++ b/OpenCL/m00150_a3.cl @@ -37,8 +37,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00150_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m00150_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00160_a0.cl b/OpenCL/m00160_a0.cl index c87a1bab7..d32cdac55 100644 --- a/OpenCL/m00160_a0.cl +++ b/OpenCL/m00160_a0.cl @@ -39,8 +39,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 
salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; diff --git a/OpenCL/m00160_a1.cl b/OpenCL/m00160_a1.cl index 6a4907c55..b8329a326 100644 --- a/OpenCL/m00160_a1.cl +++ b/OpenCL/m00160_a1.cl @@ -37,8 +37,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } 
sha1_hmac_ctx_t ctx0; diff --git a/OpenCL/m00160_a3.cl b/OpenCL/m00160_a3.cl index cdcca3d64..1712e8064 100644 --- a/OpenCL/m00160_a3.cl +++ b/OpenCL/m00160_a3.cl @@ -37,8 +37,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m00160_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m00160_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m00300_a0.cl b/OpenCL/m00300_a0.cl index 04e7f4940..d05d17076 100644 --- a/OpenCL/m00300_a0.cl +++ b/OpenCL/m00300_a0.cl @@ -39,8 +39,6 @@ __kernel void m00300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -127,8 +125,6 @@ __kernel void m00300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00300_a3.cl b/OpenCL/m00300_a3.cl index 6a9763f3f..d68b2608c 100644 --- a/OpenCL/m00300_a3.cl +++ b/OpenCL/m00300_a3.cl @@ -37,8 +37,6 @@ __kernel void m00300_mxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -131,8 +129,6 @@ __kernel void m00300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00400.cl b/OpenCL/m00400.cl index 821e4a02a..5d3ad84b4 100644 --- a/OpenCL/m00400.cl +++ b/OpenCL/m00400.cl @@ -76,8 +76,6 @@ __kernel void m00400_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } u32 digest[4]; diff --git a/OpenCL/m00500.cl b/OpenCL/m00500.cl index 54422ec96..80371e61d 100644 --- a/OpenCL/m00500.cl +++ b/OpenCL/m00500.cl @@ -40,8 +40,6 @@ __kernel void m00500_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -53,8 +51,6 @@ __kernel void m00500_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -154,8 +150,6 @@ __kernel void m00500_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -167,8 +161,6 @@ __kernel void m00500_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00900_a0.cl b/OpenCL/m00900_a0.cl index cf19e1c05..a58b5f934 100644 --- a/OpenCL/m00900_a0.cl +++ b/OpenCL/m00900_a0.cl @@ -39,8 +39,6 @@ __kernel void m00900_mxx (__global pw_t *pws, 
__global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m00900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m00900_a3.cl b/OpenCL/m00900_a3.cl index 480a1e3c4..52ab4da60 100644 --- a/OpenCL/m00900_a3.cl +++ b/OpenCL/m00900_a3.cl @@ -37,8 +37,6 @@ __kernel void m00900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m00900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01000_a0.cl b/OpenCL/m01000_a0.cl index d01ddc0d5..c28b005e6 100644 --- a/OpenCL/m01000_a0.cl +++ b/OpenCL/m01000_a0.cl @@ -39,8 +39,6 @@ __kernel void m01000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01000_a3.cl b/OpenCL/m01000_a3.cl index a9a421686..d4048fffb 100644 --- a/OpenCL/m01000_a3.cl +++ b/OpenCL/m01000_a3.cl @@ -37,8 +37,6 @@ __kernel void m01000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - 
- barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01100_a0.cl b/OpenCL/m01100_a0.cl index 085e29e18..9dc3d1638 100644 --- a/OpenCL/m01100_a0.cl +++ b/OpenCL/m01100_a0.cl @@ -39,8 +39,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -132,8 +128,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -145,8 +139,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01100_a1.cl b/OpenCL/m01100_a1.cl index 6c7ef2d9e..893122399 100644 --- a/OpenCL/m01100_a1.cl +++ b/OpenCL/m01100_a1.cl @@ -37,8 +37,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md4_ctx_t ctx0; @@ -119,8 +117,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md4_ctx_t ctx0; diff --git a/OpenCL/m01100_a3.cl b/OpenCL/m01100_a3.cl index 461b834a8..03e59c812 100644 --- a/OpenCL/m01100_a3.cl +++ b/OpenCL/m01100_a3.cl @@ -37,8 +37,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -136,8 +132,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -149,8 +143,6 @@ __kernel void m01100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01300_a0.cl b/OpenCL/m01300_a0.cl index 7607c95f8..588e58786 100644 --- a/OpenCL/m01300_a0.cl +++ b/OpenCL/m01300_a0.cl @@ -39,8 +39,6 @@ __kernel void m01300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01300_a3.cl b/OpenCL/m01300_a3.cl index cbb1203a8..d756fd264 100644 --- a/OpenCL/m01300_a3.cl +++ b/OpenCL/m01300_a3.cl @@ -37,8 +37,6 @@ __kernel void m01300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier 
(CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01400_a0.cl b/OpenCL/m01400_a0.cl index 3f07773af..b26194a63 100644 --- a/OpenCL/m01400_a0.cl +++ b/OpenCL/m01400_a0.cl @@ -39,8 +39,6 @@ __kernel void m01400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01400_a3.cl b/OpenCL/m01400_a3.cl index 6595c7b9a..3cba71a25 100644 --- a/OpenCL/m01400_a3.cl +++ b/OpenCL/m01400_a3.cl @@ -37,8 +37,6 @@ __kernel void m01400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01410_a0.cl b/OpenCL/m01410_a0.cl index 185c9e132..b517e02bd 100644 --- a/OpenCL/m01410_a0.cl +++ b/OpenCL/m01410_a0.cl @@ -39,8 +39,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } 
const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01410_a1.cl b/OpenCL/m01410_a1.cl index bf7a01885..b7d22426c 100644 --- a/OpenCL/m01410_a1.cl +++ b/OpenCL/m01410_a1.cl @@ -37,8 +37,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01410_a3.cl b/OpenCL/m01410_a3.cl index a1a7bd150..dd860cce1 100644 --- a/OpenCL/m01410_a3.cl +++ b/OpenCL/m01410_a3.cl @@ -37,8 +37,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01410_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01410_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = 
swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01420_a0.cl b/OpenCL/m01420_a0.cl index 4733245f6..6adc7367f 100644 --- a/OpenCL/m01420_a0.cl +++ b/OpenCL/m01420_a0.cl @@ -39,8 +39,6 @@ __kernel void m01420_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01420_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01420_a3.cl b/OpenCL/m01420_a3.cl index 02784ecd0..11526045b 100644 --- a/OpenCL/m01420_a3.cl +++ b/OpenCL/m01420_a3.cl @@ -37,8 +37,6 @@ __kernel void m01420_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01420_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01430_a0.cl b/OpenCL/m01430_a0.cl index 45ef46ba9..e3236eef1 100644 --- a/OpenCL/m01430_a0.cl +++ b/OpenCL/m01430_a0.cl @@ -39,8 +39,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01430_a1.cl b/OpenCL/m01430_a1.cl index 2d0efa682..8eef1f177 100644 --- a/OpenCL/m01430_a1.cl +++ b/OpenCL/m01430_a1.cl @@ -37,8 +37,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01430_a3.cl b/OpenCL/m01430_a3.cl index 2a6ccc3f8..3259e7c17 100644 --- a/OpenCL/m01430_a3.cl +++ b/OpenCL/m01430_a3.cl @@ -37,8 +37,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01430_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,8 +120,6 @@ __kernel void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -137,8 +131,6 @@ __kernel 
void m01430_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01440_a0.cl b/OpenCL/m01440_a0.cl index 4dcf235e0..271d4367e 100644 --- a/OpenCL/m01440_a0.cl +++ b/OpenCL/m01440_a0.cl @@ -39,8 +39,6 @@ __kernel void m01440_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01440_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01440_a3.cl b/OpenCL/m01440_a3.cl index a01604ac3..316e0d156 100644 --- a/OpenCL/m01440_a3.cl +++ b/OpenCL/m01440_a3.cl @@ -37,8 +37,6 @@ __kernel void m01440_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01440_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_ctx_t ctx0; diff --git a/OpenCL/m01450_a0.cl b/OpenCL/m01450_a0.cl index 21fa01b0c..e1462f303 100644 --- a/OpenCL/m01450_a0.cl +++ b/OpenCL/m01450_a0.cl @@ -39,8 +39,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - 
barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01450_a1.cl b/OpenCL/m01450_a1.cl index 25e81d266..7f7c9f635 100644 --- a/OpenCL/m01450_a1.cl +++ b/OpenCL/m01450_a1.cl @@ -37,8 +37,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -135,8 +131,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01450_a3.cl b/OpenCL/m01450_a3.cl index c55b53eae..35bd59a2a 100644 --- a/OpenCL/m01450_a3.cl +++ b/OpenCL/m01450_a3.cl @@ -37,8 +37,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01450_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m01450_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01460_a0.cl b/OpenCL/m01460_a0.cl index 0b14e52e9..d51c28cb6 100644 --- a/OpenCL/m01460_a0.cl +++ b/OpenCL/m01460_a0.cl @@ -39,8 +39,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } 
sha256_hmac_ctx_t ctx0; diff --git a/OpenCL/m01460_a1.cl b/OpenCL/m01460_a1.cl index d4b0fdb8b..61c30c095 100644 --- a/OpenCL/m01460_a1.cl +++ b/OpenCL/m01460_a1.cl @@ -37,8 +37,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_t ctx0; diff --git a/OpenCL/m01460_a3.cl b/OpenCL/m01460_a3.cl index 646ed4732..d7ac9bedf 100644 --- a/OpenCL/m01460_a3.cl +++ b/OpenCL/m01460_a3.cl @@ -37,8 +37,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01460_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int 
idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01460_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha256_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m01600.cl b/OpenCL/m01600.cl index dc110a813..bdc7e5334 100644 --- a/OpenCL/m01600.cl +++ b/OpenCL/m01600.cl @@ -41,8 +41,6 @@ __kernel void m01600_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -54,8 +52,6 @@ __kernel void m01600_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -156,8 +152,6 @@ __kernel void m01600_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -169,8 +163,6 @@ __kernel void m01600_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01700_a0.cl b/OpenCL/m01700_a0.cl index 01649a999..72e3f0f2b 100644 --- a/OpenCL/m01700_a0.cl +++ b/OpenCL/m01700_a0.cl @@ -39,8 +39,6 @@ __kernel void m01700_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m01700_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < 
pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01700_a3.cl b/OpenCL/m01700_a3.cl index e35083c8b..80f66afb2 100644 --- a/OpenCL/m01700_a3.cl +++ b/OpenCL/m01700_a3.cl @@ -37,8 +37,6 @@ __kernel void m01700_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m01700_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01710_a0.cl b/OpenCL/m01710_a0.cl index 593be6b7d..acd32a0f3 100644 --- a/OpenCL/m01710_a0.cl +++ b/OpenCL/m01710_a0.cl @@ -39,8 +39,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01710_a1.cl b/OpenCL/m01710_a1.cl index 485adbb32..b09354e90 100644 --- a/OpenCL/m01710_a1.cl +++ b/OpenCL/m01710_a1.cl @@ -37,8 +37,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global 
const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01710_a3.cl b/OpenCL/m01710_a3.cl index 6da78255b..f54ce7355 100644 --- a/OpenCL/m01710_a3.cl +++ b/OpenCL/m01710_a3.cl @@ -37,8 +37,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -123,8 +119,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01720_a0.cl b/OpenCL/m01720_a0.cl index adcf3501b..e80c9426c 100644 --- a/OpenCL/m01720_a0.cl +++ b/OpenCL/m01720_a0.cl @@ -39,8 +39,6 @@ __kernel void m01720_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01720_sxx (__global 
pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01720_a3.cl b/OpenCL/m01720_a3.cl index e6bb100d3..b28688280 100644 --- a/OpenCL/m01720_a3.cl +++ b/OpenCL/m01720_a3.cl @@ -37,8 +37,6 @@ __kernel void m01720_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01720_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01730_a0.cl b/OpenCL/m01730_a0.cl index f4a8951a8..16a6753a7 100644 --- a/OpenCL/m01730_a0.cl +++ b/OpenCL/m01730_a0.cl @@ -39,8 +39,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -119,8 +115,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01730_a1.cl b/OpenCL/m01730_a1.cl index 7529dd71d..a937af113 100644 --- 
a/OpenCL/m01730_a1.cl +++ b/OpenCL/m01730_a1.cl @@ -37,8 +37,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -106,8 +104,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01730_a3.cl b/OpenCL/m01730_a3.cl index c408f8105..90d53fd56 100644 --- a/OpenCL/m01730_a3.cl +++ b/OpenCL/m01730_a3.cl @@ -37,8 +37,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01730_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -124,8 +120,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -137,8 +131,6 @@ __kernel void m01730_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01740_a0.cl b/OpenCL/m01740_a0.cl index d489d6e67..0c76ab222 100644 --- a/OpenCL/m01740_a0.cl +++ b/OpenCL/m01740_a0.cl @@ -39,8 +39,6 @@ __kernel void m01740_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m01740_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01740_a3.cl b/OpenCL/m01740_a3.cl index 6ae09ef93..0e879be01 100644 --- a/OpenCL/m01740_a3.cl +++ b/OpenCL/m01740_a3.cl @@ -37,8 +37,6 @@ __kernel void m01740_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m01740_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_ctx_t ctx0; diff --git a/OpenCL/m01750_a0.cl b/OpenCL/m01750_a0.cl index 0ff9c0346..b3e12bff9 100644 --- a/OpenCL/m01750_a0.cl +++ b/OpenCL/m01750_a0.cl @@ -39,8 +39,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -117,8 +113,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -130,8 +124,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier 
(CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01750_a1.cl b/OpenCL/m01750_a1.cl index d6d9da0fd..430d9a7ce 100644 --- a/OpenCL/m01750_a1.cl +++ b/OpenCL/m01750_a1.cl @@ -37,8 +37,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -135,8 +131,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -148,8 +142,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01750_a3.cl b/OpenCL/m01750_a3.cl index 28c6a4b38..a9c59d030 100644 --- a/OpenCL/m01750_a3.cl +++ b/OpenCL/m01750_a3.cl @@ -37,8 +37,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01750_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -121,8 +117,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - 
barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -134,8 +128,6 @@ __kernel void m01750_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m01760_a0.cl b/OpenCL/m01760_a0.cl index f977db7ac..460e1cb3f 100644 --- a/OpenCL/m01760_a0.cl +++ b/OpenCL/m01760_a0.cl @@ -39,8 +39,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; @@ -119,8 +115,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -132,8 +126,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; diff --git a/OpenCL/m01760_a1.cl b/OpenCL/m01760_a1.cl index 5403bcc5d..1a558b84f 100644 --- a/OpenCL/m01760_a1.cl +++ b/OpenCL/m01760_a1.cl @@ -37,8 +37,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; @@ -137,8 +133,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -150,8 +144,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_t ctx0; diff --git a/OpenCL/m01760_a3.cl b/OpenCL/m01760_a3.cl index f3b80a484..4bb4c0d78 100644 --- a/OpenCL/m01760_a3.cl +++ b/OpenCL/m01760_a3.cl @@ -37,8 +37,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m01760_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_vector_t ctx0; @@ -123,8 +119,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m01760_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha512_hmac_ctx_vector_t ctx0; diff --git a/OpenCL/m01800.cl b/OpenCL/m01800.cl index 46270e3dc..02360d45e 100644 --- 
a/OpenCL/m01800.cl +++ b/OpenCL/m01800.cl @@ -36,8 +36,6 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < pw_lenv; idx++) @@ -54,8 +52,6 @@ __kernel void m01800_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < salt_lenv; idx++) diff --git a/OpenCL/m02610_a0.cl b/OpenCL/m02610_a0.cl index a09b5b41e..d86d026c8 100644 --- a/OpenCL/m02610_a0.cl +++ b/OpenCL/m02610_a0.cl @@ -69,8 +69,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -82,8 +80,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -197,8 +193,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -210,8 +204,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m02610_a1.cl b/OpenCL/m02610_a1.cl index 3fbdd72f9..0cfd702fa 100644 --- a/OpenCL/m02610_a1.cl +++ b/OpenCL/m02610_a1.cl @@ -67,8 +67,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - 
- barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -184,8 +182,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m02610_a3.cl b/OpenCL/m02610_a3.cl index 442dfe070..441ec66fd 100644 --- a/OpenCL/m02610_a3.cl +++ b/OpenCL/m02610_a3.cl @@ -67,8 +67,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m02610_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -201,8 +197,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -214,8 +208,6 @@ __kernel void m02610_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m02810_a0.cl b/OpenCL/m02810_a0.cl index 502105952..c1e047201 100644 --- a/OpenCL/m02810_a0.cl +++ b/OpenCL/m02810_a0.cl @@ -69,8 +69,6 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -82,8 +80,6 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier 
(CLK_GLOBAL_MEM_FENCE); } /** @@ -220,8 +216,6 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -233,8 +227,6 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m02810_a1.cl b/OpenCL/m02810_a1.cl index df010bf8d..c14141814 100644 --- a/OpenCL/m02810_a1.cl +++ b/OpenCL/m02810_a1.cl @@ -67,8 +67,6 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -207,8 +205,6 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index f46abd13c..5cd8bbc94 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -67,8 +67,6 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -80,8 +78,6 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -224,8 +220,6 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -237,8 +231,6 @@ __kernel void 
m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03710_a0.cl b/OpenCL/m03710_a0.cl index 89b894741..1401ad716 100644 --- a/OpenCL/m03710_a0.cl +++ b/OpenCL/m03710_a0.cl @@ -69,8 +69,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -82,8 +80,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -210,8 +206,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -223,8 +217,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03710_a1.cl b/OpenCL/m03710_a1.cl index 16e7caa5e..18ed573ad 100644 --- a/OpenCL/m03710_a1.cl +++ b/OpenCL/m03710_a1.cl @@ -67,8 +67,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -197,8 +195,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03710_a3.cl b/OpenCL/m03710_a3.cl 
index 8c35c9263..e17da66f9 100644 --- a/OpenCL/m03710_a3.cl +++ b/OpenCL/m03710_a3.cl @@ -67,8 +67,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m03710_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -214,8 +210,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -227,8 +221,6 @@ __kernel void m03710_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03800_a0.cl b/OpenCL/m03800_a0.cl index 42344114b..8608dc55c 100644 --- a/OpenCL/m03800_a0.cl +++ b/OpenCL/m03800_a0.cl @@ -39,8 +39,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -123,8 +119,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void 
m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03800_a1.cl b/OpenCL/m03800_a1.cl index c594eee3d..31007631a 100644 --- a/OpenCL/m03800_a1.cl +++ b/OpenCL/m03800_a1.cl @@ -37,8 +37,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03800_a3.cl b/OpenCL/m03800_a3.cl index 00275dbeb..5736596bc 100644 --- a/OpenCL/m03800_a3.cl +++ b/OpenCL/m03800_a3.cl @@ -37,8 +37,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m03800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -129,8 +125,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -142,8 +136,6 @@ __kernel void m03800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git 
a/OpenCL/m03910_a0.cl b/OpenCL/m03910_a0.cl index 1139e0cb6..a296705c5 100644 --- a/OpenCL/m03910_a0.cl +++ b/OpenCL/m03910_a0.cl @@ -69,8 +69,6 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -82,8 +80,6 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -220,8 +216,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -233,8 +227,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m03910_a1.cl b/OpenCL/m03910_a1.cl index 40c9071dc..1a082b9df 100644 --- a/OpenCL/m03910_a1.cl +++ b/OpenCL/m03910_a1.cl @@ -67,8 +67,6 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -207,8 +205,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m03910_a3.cl b/OpenCL/m03910_a3.cl index 48e7ae215..95ca4a3f3 100644 --- a/OpenCL/m03910_a3.cl +++ b/OpenCL/m03910_a3.cl @@ -67,8 +67,6 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - 
- barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -80,8 +78,6 @@ __kernel void m03910_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -224,8 +220,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = 32; @@ -237,8 +231,6 @@ __kernel void m03910_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf_pc[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04010_a0.cl b/OpenCL/m04010_a0.cl index 454997290..e1365e632 100644 --- a/OpenCL/m04010_a0.cl +++ b/OpenCL/m04010_a0.cl @@ -69,8 +69,6 @@ __kernel void m04010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -197,8 +195,6 @@ __kernel void m04010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04010_a3.cl b/OpenCL/m04010_a3.cl index 4822b707b..91b54e48f 100644 --- a/OpenCL/m04010_a3.cl +++ b/OpenCL/m04010_a3.cl @@ -67,8 +67,6 @@ __kernel void m04010_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -205,8 +203,6 @@ __kernel void m04010_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04110_a0.cl b/OpenCL/m04110_a0.cl index 
4bcb4def6..ed85497a3 100644 --- a/OpenCL/m04110_a0.cl +++ b/OpenCL/m04110_a0.cl @@ -69,8 +69,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -82,8 +80,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -214,8 +210,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -227,8 +221,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04110_a1.cl b/OpenCL/m04110_a1.cl index 57684942a..5bce915db 100644 --- a/OpenCL/m04110_a1.cl +++ b/OpenCL/m04110_a1.cl @@ -67,8 +67,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -201,8 +199,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04110_a3.cl b/OpenCL/m04110_a3.cl index f466a3178..57ec737f4 100644 --- a/OpenCL/m04110_a3.cl +++ b/OpenCL/m04110_a3.cl @@ -67,8 +67,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m04110_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -220,8 +216,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -233,8 +227,6 @@ __kernel void m04110_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04310_a0.cl b/OpenCL/m04310_a0.cl index 31d1af318..6e2ac764e 100644 --- a/OpenCL/m04310_a0.cl +++ b/OpenCL/m04310_a0.cl @@ -69,8 +69,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -82,8 +80,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -197,8 +193,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -210,8 +204,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04310_a1.cl 
b/OpenCL/m04310_a1.cl index bf33a9155..3ed91424c 100644 --- a/OpenCL/m04310_a1.cl +++ b/OpenCL/m04310_a1.cl @@ -67,8 +67,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -184,8 +182,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m04310_a3.cl b/OpenCL/m04310_a3.cl index 2f491a2fb..e265cf4a1 100644 --- a/OpenCL/m04310_a3.cl +++ b/OpenCL/m04310_a3.cl @@ -67,8 +67,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -80,8 +78,6 @@ __kernel void m04310_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -201,8 +197,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -214,8 +208,6 @@ __kernel void m04310_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04400_a0.cl b/OpenCL/m04400_a0.cl index 877fee359..7d583393c 100644 --- a/OpenCL/m04400_a0.cl +++ b/OpenCL/m04400_a0.cl @@ -70,8 +70,6 @@ __kernel void m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -188,8 +186,6 @@ __kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04400_a3.cl b/OpenCL/m04400_a3.cl index 598de7915..8439a874d 100644 --- a/OpenCL/m04400_a3.cl +++ b/OpenCL/m04400_a3.cl @@ -68,8 +68,6 @@ __kernel void m04400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -192,8 +190,6 @@ __kernel void m04400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04500_a0.cl b/OpenCL/m04500_a0.cl index 5f2d0d62e..35d111c0e 100644 --- a/OpenCL/m04500_a0.cl +++ b/OpenCL/m04500_a0.cl @@ -69,8 +69,6 @@ __kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -187,8 +185,6 @@ __kernel void m04500_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04500_a3.cl b/OpenCL/m04500_a3.cl index c14543397..cea19b72f 100644 --- a/OpenCL/m04500_a3.cl +++ b/OpenCL/m04500_a3.cl @@ -67,8 +67,6 @@ __kernel void m04500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -191,8 +189,6 @@ __kernel void m04500_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04520_a0.cl 
b/OpenCL/m04520_a0.cl index 5ac22b8ac..bf8924407 100644 --- a/OpenCL/m04520_a0.cl +++ b/OpenCL/m04520_a0.cl @@ -69,8 +69,6 @@ __kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -202,8 +200,6 @@ __kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m04520_a3.cl b/OpenCL/m04520_a3.cl index dd633dda1..ff9a71e82 100644 --- a/OpenCL/m04520_a3.cl +++ b/OpenCL/m04520_a3.cl @@ -67,8 +67,6 @@ __kernel void m04520_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -208,8 +206,6 @@ __kernel void m04520_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m04700_a0.cl b/OpenCL/m04700_a0.cl index bb2374eaf..611834f5d 100644 --- a/OpenCL/m04700_a0.cl +++ b/OpenCL/m04700_a0.cl @@ -70,8 +70,6 @@ __kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -184,8 +182,6 @@ __kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04700_a3.cl b/OpenCL/m04700_a3.cl index 9b00733ed..9211ec451 100644 --- a/OpenCL/m04700_a3.cl +++ b/OpenCL/m04700_a3.cl @@ -68,8 +68,6 @@ __kernel void m04700_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -187,8 +185,6 @@ __kernel void m04700_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m04800_a0.cl b/OpenCL/m04800_a0.cl index cd7c9cc07..fea15ad37 100644 --- a/OpenCL/m04800_a0.cl +++ b/OpenCL/m04800_a0.cl @@ -39,8 +39,6 @@ __kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; @@ -121,8 +119,6 @@ __kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; diff --git a/OpenCL/m04800_a3.cl b/OpenCL/m04800_a3.cl index e69628fbb..ed25c642c 100644 --- a/OpenCL/m04800_a3.cl +++ b/OpenCL/m04800_a3.cl @@ -37,8 +37,6 @@ __kernel void m04800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; @@ -127,8 +125,6 @@ __kernel void m04800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len - 1; diff --git a/OpenCL/m04900_a0.cl b/OpenCL/m04900_a0.cl index b323e6e6e..b1332a3de 100644 --- a/OpenCL/m04900_a0.cl +++ b/OpenCL/m04900_a0.cl @@ -39,8 +39,6 @@ __kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ 
-52,8 +50,6 @@ __kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -123,8 +119,6 @@ __kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -136,8 +130,6 @@ __kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl index a13057b0e..1a05735e3 100644 --- a/OpenCL/m04900_a1.cl +++ b/OpenCL/m04900_a1.cl @@ -37,8 +37,6 @@ __kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m04900_a3.cl b/OpenCL/m04900_a3.cl index 6b365a6f8..3f9668d61 100644 --- a/OpenCL/m04900_a3.cl +++ b/OpenCL/m04900_a3.cl @@ -37,8 +37,6 @@ __kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m04900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32 
(salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -129,8 +125,6 @@ __kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -142,8 +136,6 @@ __kernel void m04900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m05100_a0.cl b/OpenCL/m05100_a0.cl index 9da9df5b7..bbaf71b68 100644 --- a/OpenCL/m05100_a0.cl +++ b/OpenCL/m05100_a0.cl @@ -39,8 +39,6 @@ __kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05100_a3.cl b/OpenCL/m05100_a3.cl index e8a25bdbc..3fcb567c9 100644 --- a/OpenCL/m05100_a3.cl +++ b/OpenCL/m05100_a3.cl @@ -37,8 +37,6 @@ __kernel void m05100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -112,8 +110,6 @@ __kernel void m05100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05300_a0.cl b/OpenCL/m05300_a0.cl index 6c17f8865..552785041 100644 --- a/OpenCL/m05300_a0.cl +++ b/OpenCL/m05300_a0.cl @@ -39,8 +39,6 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; 
idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -134,8 +132,6 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05300_a1.cl b/OpenCL/m05300_a1.cl index aedad50e9..e96b61ba2 100644 --- a/OpenCL/m05300_a1.cl +++ b/OpenCL/m05300_a1.cl @@ -37,8 +37,6 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -152,8 +150,6 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05300_a3.cl b/OpenCL/m05300_a3.cl index e42db4387..80891631c 100644 --- a/OpenCL/m05300_a3.cl +++ b/OpenCL/m05300_a3.cl @@ -37,8 +37,6 @@ __kernel void m05300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -138,8 +136,6 @@ __kernel void m05300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05400_a0.cl b/OpenCL/m05400_a0.cl index 97d759c03..252e18f61 100644 --- a/OpenCL/m05400_a0.cl +++ b/OpenCL/m05400_a0.cl @@ -39,8 +39,6 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -134,8 +132,6 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git 
a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl index 74bd00d06..8aa75bef2 100644 --- a/OpenCL/m05400_a1.cl +++ b/OpenCL/m05400_a1.cl @@ -37,8 +37,6 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -152,8 +150,6 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05400_a3.cl b/OpenCL/m05400_a3.cl index 92c014049..e40fd0cbd 100644 --- a/OpenCL/m05400_a3.cl +++ b/OpenCL/m05400_a3.cl @@ -37,8 +37,6 @@ __kernel void m05400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -138,8 +136,6 @@ __kernel void m05400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05500_a0.cl b/OpenCL/m05500_a0.cl index 3780e2e1b..d10338f6c 100644 --- a/OpenCL/m05500_a0.cl +++ b/OpenCL/m05500_a0.cl @@ -553,8 +553,6 @@ __kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -698,8 +696,6 @@ __kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05500_a3.cl b/OpenCL/m05500_a3.cl index 261e49457..192e3c2a7 100644 --- a/OpenCL/m05500_a3.cl +++ b/OpenCL/m05500_a3.cl @@ -550,8 +550,6 @@ __kernel void m05500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = 
pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -701,8 +699,6 @@ __kernel void m05500_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05600_a0.cl b/OpenCL/m05600_a0.cl index d839d314d..1c2ac82bf 100644 --- a/OpenCL/m05600_a0.cl +++ b/OpenCL/m05600_a0.cl @@ -40,8 +40,6 @@ __kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -160,8 +158,6 @@ __kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05600_a3.cl b/OpenCL/m05600_a3.cl index c963a3d66..7ec0e36a9 100644 --- a/OpenCL/m05600_a3.cl +++ b/OpenCL/m05600_a3.cl @@ -38,8 +38,6 @@ __kernel void m05600_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -164,8 +162,6 @@ __kernel void m05600_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index a447bf79e..4569e1c41 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -2267,8 +2267,6 @@ __kernel void m05800_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32 (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -2280,8 +2278,6 @@ __kernel void m05800_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); - - 
barrier (CLK_GLOBAL_MEM_FENCE); } u32 digest[5]; diff --git a/OpenCL/m06000_a0.cl b/OpenCL/m06000_a0.cl index 6f4c1d1a0..cc67d8593 100644 --- a/OpenCL/m06000_a0.cl +++ b/OpenCL/m06000_a0.cl @@ -39,8 +39,6 @@ __kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m06000_a3.cl b/OpenCL/m06000_a3.cl index e3b171d63..62f84f02a 100644 --- a/OpenCL/m06000_a3.cl +++ b/OpenCL/m06000_a3.cl @@ -37,8 +37,6 @@ __kernel void m06000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m06000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m06100_a0.cl b/OpenCL/m06100_a0.cl index 48fcad611..5b798d0c5 100644 --- a/OpenCL/m06100_a0.cl +++ b/OpenCL/m06100_a0.cl @@ -70,8 +70,6 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -166,8 +164,6 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m06100_a3.cl b/OpenCL/m06100_a3.cl index da99bac26..a9bd2a358 100644 --- a/OpenCL/m06100_a3.cl +++ b/OpenCL/m06100_a3.cl @@ -68,8 +68,6 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for 
(int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -170,8 +168,6 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m06300.cl b/OpenCL/m06300.cl index dbe156e8d..0397d0362 100644 --- a/OpenCL/m06300.cl +++ b/OpenCL/m06300.cl @@ -38,8 +38,6 @@ __kernel void m06300_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -51,8 +49,6 @@ __kernel void m06300_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -146,8 +142,6 @@ __kernel void m06300_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -159,8 +153,6 @@ __kernel void m06300_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m07000_a0.cl b/OpenCL/m07000_a0.cl index 40201efbf..412a929ef 100644 --- a/OpenCL/m07000_a0.cl +++ b/OpenCL/m07000_a0.cl @@ -39,8 +39,6 @@ __kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -137,8 +135,6 @@ __kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } 
sha1_ctx_t ctx0; diff --git a/OpenCL/m07000_a3.cl b/OpenCL/m07000_a3.cl index e80136832..6f5bf96ed 100644 --- a/OpenCL/m07000_a3.cl +++ b/OpenCL/m07000_a3.cl @@ -37,8 +37,6 @@ __kernel void m07000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -142,8 +140,6 @@ __kernel void m07000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m07300_a0.cl b/OpenCL/m07300_a0.cl index 1b78ea340..596db3a54 100644 --- a/OpenCL/m07300_a0.cl +++ b/OpenCL/m07300_a0.cl @@ -39,8 +39,6 @@ __kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m07300_a1.cl b/OpenCL/m07300_a1.cl index 3e8e63a71..775d68a0f 100644 --- a/OpenCL/m07300_a1.cl +++ b/OpenCL/m07300_a1.cl @@ -37,8 +37,6 @@ __kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -122,8 +120,6 @@ __kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m07300_a3.cl b/OpenCL/m07300_a3.cl index ff2f6f0b1..4f5532822 100644 --- a/OpenCL/m07300_a3.cl +++ b/OpenCL/m07300_a3.cl @@ -37,8 +37,6 @@ __kernel void m07300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int 
idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m07300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m07400.cl b/OpenCL/m07400.cl index 936add9f0..9eda4d84a 100644 --- a/OpenCL/m07400.cl +++ b/OpenCL/m07400.cl @@ -36,8 +36,6 @@ __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < pw_lenv; idx++) @@ -54,8 +52,6 @@ __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = salt_bufs[salt_pos].salt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < salt_lenv; idx++) diff --git a/OpenCL/m07500_a0.cl b/OpenCL/m07500_a0.cl index e38afb3dd..72491071f 100644 --- a/OpenCL/m07500_a0.cl +++ b/OpenCL/m07500_a0.cl @@ -292,8 +292,6 @@ __kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; @@ -370,8 +368,6 @@ __kernel void m07500_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; diff --git a/OpenCL/m07500_a3.cl b/OpenCL/m07500_a3.cl index 2afee5d50..9271e3924 100644 --- a/OpenCL/m07500_a3.cl +++ b/OpenCL/m07500_a3.cl @@ -302,8 +302,6 @@ __kernel void m07500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; @@ -398,8 +396,6 @@ __kernel void m07500_sxx 
(__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; diff --git a/OpenCL/m07900.cl b/OpenCL/m07900.cl index d50b1d4c2..eaca08e00 100644 --- a/OpenCL/m07900.cl +++ b/OpenCL/m07900.cl @@ -66,8 +66,6 @@ __kernel void m07900_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } for (int idx = 0; idx < pw_lenv; idx++) diff --git a/OpenCL/m08100_a0.cl b/OpenCL/m08100_a0.cl index 8d087d790..3f3839efa 100644 --- a/OpenCL/m08100_a0.cl +++ b/OpenCL/m08100_a0.cl @@ -39,8 +39,6 @@ __kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m08100_a3.cl b/OpenCL/m08100_a3.cl index 59f0b281a..746fd240b 100644 --- a/OpenCL/m08100_a3.cl +++ b/OpenCL/m08100_a3.cl @@ -37,8 +37,6 @@ __kernel void m08100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m08100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m08300_a0.cl b/OpenCL/m08300_a0.cl index 39cdf7d2a..906050bef 100644 --- a/OpenCL/m08300_a0.cl +++ b/OpenCL/m08300_a0.cl @@ -39,8 +39,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; 
idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -52,8 +50,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; @@ -65,8 +61,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_len_pcv; idx++) { s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_iter = salt_bufs[salt_pos].salt_iter; @@ -175,8 +169,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -188,8 +180,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; @@ -201,8 +191,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_len_pcv; idx++) { s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_iter = salt_bufs[salt_pos].salt_iter; diff --git a/OpenCL/m08300_a1.cl b/OpenCL/m08300_a1.cl index 57904d8d5..c5fd82a79 100644 --- a/OpenCL/m08300_a1.cl +++ b/OpenCL/m08300_a1.cl @@ -37,8 +37,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len_pc = 
salt_bufs[salt_pos].salt_len_pc; @@ -50,8 +48,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_len_pcv; idx++) { s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_iter = salt_bufs[salt_pos].salt_iter; @@ -160,8 +156,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32_S (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; @@ -173,8 +167,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_len_pcv; idx++) { s_pc[idx] = swap32_S (salt_bufs[salt_pos].salt_buf_pc[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_iter = salt_bufs[salt_pos].salt_iter; diff --git a/OpenCL/m08300_a3.cl b/OpenCL/m08300_a3.cl index 0fc7148f0..c15333c33 100644 --- a/OpenCL/m08300_a3.cl +++ b/OpenCL/m08300_a3.cl @@ -37,8 +37,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -50,8 +48,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; @@ -63,8 +59,6 @@ __kernel void m08300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_len_pcv; idx++) { s_pc[idx] = swap32 (salt_bufs[salt_pos].salt_buf_pc[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_iter = salt_bufs[salt_pos].salt_iter; @@ -179,8 +173,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -192,8 +184,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_lenv; idx++) { s[idx] = swap32 (salt_bufs[salt_pos].salt_buf[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_len_pc = salt_bufs[salt_pos].salt_len_pc; @@ -205,8 +195,6 @@ __kernel void m08300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < salt_len_pcv; idx++) { s_pc[idx] = swap32 (salt_bufs[salt_pos].salt_buf_pc[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 salt_iter = salt_bufs[salt_pos].salt_iter; diff --git a/OpenCL/m08400_a0.cl b/OpenCL/m08400_a0.cl index 021a82f91..123887eb7 100644 --- a/OpenCL/m08400_a0.cl +++ b/OpenCL/m08400_a0.cl @@ -69,8 +69,6 @@ __kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -241,8 +239,6 @@ __kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m08400_a3.cl b/OpenCL/m08400_a3.cl index 769795b39..541cd096f 100644 --- a/OpenCL/m08400_a3.cl +++ b/OpenCL/m08400_a3.cl @@ -67,8 +67,6 @@ __kernel void m08400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -249,8 +247,6 @@ __kernel void m08400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m08900.cl b/OpenCL/m08900.cl index 
9d65e1a71..4d3dd3913 100644 --- a/OpenCL/m08900.cl +++ b/OpenCL/m08900.cl @@ -265,8 +265,6 @@ __kernel void m08900_init (__global pw_t *pws, __global const kernel_rule_t *rul const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]); - barrier (CLK_GLOBAL_MEM_FENCE); - tmps[gid].P[k + 0] = tmp0; tmps[gid].P[k + 1] = tmp1; } @@ -331,8 +329,6 @@ __kernel void m08900_comp (__global pw_t *pws, __global const kernel_rule_t *rul for (u32 l = 0; l < SCRYPT_CNT4; l += 4) { - barrier (CLK_GLOBAL_MEM_FENCE); - uint4 tmp; tmp = tmps[gid].P[l + 0]; diff --git a/OpenCL/m09900_a0.cl b/OpenCL/m09900_a0.cl index ff06f126e..7e7459d11 100644 --- a/OpenCL/m09900_a0.cl +++ b/OpenCL/m09900_a0.cl @@ -39,8 +39,6 @@ __kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m09900_a3.cl b/OpenCL/m09900_a3.cl index 43c464e78..28b1ebe9f 100644 --- a/OpenCL/m09900_a3.cl +++ b/OpenCL/m09900_a3.cl @@ -37,8 +37,6 @@ __kernel void m09900_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m09900_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m10700.cl b/OpenCL/m10700.cl index a341d3d6e..693691652 100644 --- a/OpenCL/m10700.cl +++ b/OpenCL/m10700.cl @@ -1202,8 +1202,6 @@ __kernel void m10700_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 
0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m10800_a0.cl b/OpenCL/m10800_a0.cl index 5e0780118..a84376f81 100644 --- a/OpenCL/m10800_a0.cl +++ b/OpenCL/m10800_a0.cl @@ -39,8 +39,6 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -104,8 +102,6 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = swap32_S (pws[gid].i[idx]); - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m10800_a3.cl b/OpenCL/m10800_a3.cl index 72ba67211..adff74a7f 100644 --- a/OpenCL/m10800_a3.cl +++ b/OpenCL/m10800_a3.cl @@ -37,8 +37,6 @@ __kernel void m10800_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -108,8 +106,6 @@ __kernel void m10800_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m11000_a0.cl b/OpenCL/m11000_a0.cl index 6449f5692..cc7310547 100644 --- a/OpenCL/m11000_a0.cl +++ b/OpenCL/m11000_a0.cl @@ -39,8 +39,6 @@ __kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -108,8 +106,6 @@ __kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m11000_a3.cl b/OpenCL/m11000_a3.cl index 5c4788266..b6e7b5f32 100644 --- a/OpenCL/m11000_a3.cl +++ b/OpenCL/m11000_a3.cl @@ -37,8 +37,6 @@ 
__kernel void m11000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -114,8 +112,6 @@ __kernel void m11000_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m11100_a0.cl b/OpenCL/m11100_a0.cl index 2b73633d9..e2b372639 100644 --- a/OpenCL/m11100_a0.cl +++ b/OpenCL/m11100_a0.cl @@ -91,8 +91,6 @@ __kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -258,8 +256,6 @@ __kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m11100_a3.cl b/OpenCL/m11100_a3.cl index 492d4b9af..48216deed 100644 --- a/OpenCL/m11100_a3.cl +++ b/OpenCL/m11100_a3.cl @@ -89,8 +89,6 @@ __kernel void m11100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -292,8 +290,6 @@ __kernel void m11100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m11200_a0.cl b/OpenCL/m11200_a0.cl index c416f9b6a..1ba45efe8 100644 --- a/OpenCL/m11200_a0.cl +++ b/OpenCL/m11200_a0.cl @@ -39,8 +39,6 @@ __kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -176,8 +174,6 @@ __kernel void m11200_sxx (__global pw_t *pws, __global const 
kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m11200_a3.cl b/OpenCL/m11200_a3.cl index d162f3690..a76f4d508 100644 --- a/OpenCL/m11200_a3.cl +++ b/OpenCL/m11200_a3.cl @@ -49,8 +49,6 @@ __kernel void m11200_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; @@ -194,8 +192,6 @@ __kernel void m11200_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } sha1_ctx_t ctx0; diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl index f75b2e104..72541c78c 100644 --- a/OpenCL/m11400_a0.cl +++ b/OpenCL/m11400_a0.cl @@ -69,8 +69,6 @@ __kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -188,8 +186,6 @@ __kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m11400_a3.cl b/OpenCL/m11400_a3.cl index ec568abd5..cace5a474 100644 --- a/OpenCL/m11400_a3.cl +++ b/OpenCL/m11400_a3.cl @@ -67,8 +67,6 @@ __kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; @@ -80,8 +78,6 @@ __kernel void m11400_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < esalt_lenv; idx++) { esalt_buf[idx] = esalt_bufs[digests_offset].esalt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; @@ -207,8 +203,6 @@ 
__kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } const u32 esalt_len = esalt_bufs[digests_offset].esalt_len; @@ -220,8 +214,6 @@ __kernel void m11400_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < esalt_lenv; idx++) { esalt_buf[idx] = esalt_bufs[digests_offset].esalt_buf[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } md5_ctx_t ctx0; diff --git a/OpenCL/m11600.cl b/OpenCL/m11600.cl index 44676b478..34b00d893 100644 --- a/OpenCL/m11600.cl +++ b/OpenCL/m11600.cl @@ -177,8 +177,6 @@ __kernel void m11600_loop (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m12400.cl b/OpenCL/m12400.cl index 7ac2b0f0f..1f56c3b4c 100644 --- a/OpenCL/m12400.cl +++ b/OpenCL/m12400.cl @@ -544,8 +544,6 @@ __kernel void m12400_init (__global pw_t *pws, __global const kernel_rule_t *rul for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } u32 tt; diff --git a/OpenCL/m12600_a0.cl b/OpenCL/m12600_a0.cl index 79da6bd9d..22ad327fb 100644 --- a/OpenCL/m12600_a0.cl +++ b/OpenCL/m12600_a0.cl @@ -85,8 +85,6 @@ __kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -243,8 +241,6 @@ __kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m12600_a3.cl b/OpenCL/m12600_a3.cl index be4ba4eb2..75dde746a 100644 --- a/OpenCL/m12600_a3.cl +++ b/OpenCL/m12600_a3.cl @@ -83,8 +83,6 @@ __kernel void m12600_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for 
(int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -246,8 +244,6 @@ __kernel void m12600_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m13100_a0.cl b/OpenCL/m13100_a0.cl index d01619666..a3672cff3 100644 --- a/OpenCL/m13100_a0.cl +++ b/OpenCL/m13100_a0.cl @@ -401,8 +401,6 @@ __kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; @@ -470,8 +468,6 @@ __kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; diff --git a/OpenCL/m13100_a3.cl b/OpenCL/m13100_a3.cl index 7b1ac5a05..3b7b8b742 100644 --- a/OpenCL/m13100_a3.cl +++ b/OpenCL/m13100_a3.cl @@ -399,8 +399,6 @@ __kernel void m13100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; @@ -474,8 +472,6 @@ __kernel void m13100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } __local RC4_KEY rc4_keys[64]; diff --git a/OpenCL/m13300_a0.cl b/OpenCL/m13300_a0.cl index 0c3a005b0..a8e85f335 100644 --- a/OpenCL/m13300_a0.cl +++ b/OpenCL/m13300_a0.cl @@ -39,8 +39,6 @@ __kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -106,8 +104,6 @@ __kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule 
for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m13300_a3.cl b/OpenCL/m13300_a3.cl index 8b69186a2..2ba67e412 100644 --- a/OpenCL/m13300_a3.cl +++ b/OpenCL/m13300_a3.cl @@ -37,8 +37,6 @@ __kernel void m13300_mxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** @@ -110,8 +108,6 @@ __kernel void m13300_sxx (__global pw_t *pws, __global const kernel_rule_t *rule for (int idx = 0; idx < pw_lenv; idx++) { w[idx] = pws[gid].i[idx]; - - barrier (CLK_GLOBAL_MEM_FENCE); } /** diff --git a/OpenCL/m15700.cl b/OpenCL/m15700.cl index a1971093f..8f5de92a9 100644 --- a/OpenCL/m15700.cl +++ b/OpenCL/m15700.cl @@ -396,8 +396,6 @@ __kernel void m15700_init (__global pw_t *pws, __global const kernel_rule_t *rul const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]); const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]); - barrier (CLK_GLOBAL_MEM_FENCE); - tmps[gid].P[k + 0] = tmp0; tmps[gid].P[k + 1] = tmp1; } @@ -462,8 +460,6 @@ __kernel void m15700_comp (__global pw_t *pws, __global const kernel_rule_t *rul for (u32 l = 0; l < SCRYPT_CNT4; l += 4) { - barrier (CLK_GLOBAL_MEM_FENCE); - uint4 tmp; tmp = tmps[gid].P[l + 0]; From 24a2fb01aaf7f761a66ac58ff60809e6566e0ff4 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 7 Aug 2017 18:58:23 +0200 Subject: [PATCH 73/75] Fix missing barrier in -m 8500 --- OpenCL/m08500_a0.cl | 2 ++ OpenCL/m08500_a1.cl | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/OpenCL/m08500_a0.cl b/OpenCL/m08500_a0.cl index e81915163..2a1d7a98a 100644 --- a/OpenCL/m08500_a0.cl +++ b/OpenCL/m08500_a0.cl @@ -559,6 +559,8 @@ __kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } + barrier (CLK_LOCAL_MEM_FENCE); + if (gid >= gid_max) return; /** diff --git a/OpenCL/m08500_a1.cl 
b/OpenCL/m08500_a1.cl index ade03fca5..50d221f52 100644 --- a/OpenCL/m08500_a1.cl +++ b/OpenCL/m08500_a1.cl @@ -557,6 +557,8 @@ __kernel void m08500_mxx (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } + barrier (CLK_LOCAL_MEM_FENCE); + if (gid >= gid_max) return; /** @@ -711,6 +713,8 @@ __kernel void m08500_sxx (__global pw_t *pws, __global const kernel_rule_t *rule s_skb[7][i] = c_skb[7][i]; } + barrier (CLK_LOCAL_MEM_FENCE); + if (gid >= gid_max) return; /** From a5c0aa604113b9b60965ca06fc2d323f3a3bf574 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 8 Aug 2017 11:45:22 +0200 Subject: [PATCH 74/75] Add pure kernels for PeopleSoft PS_TOKEN --- OpenCL/m13500_a0.cl | 192 +++++++++++++++++++++++++++++++++++++++++ OpenCL/m13500_a1.cl | 168 ++++++++++++++++++++++++++++++++++++ OpenCL/m13500_a3.cl | 206 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 566 insertions(+) create mode 100644 OpenCL/m13500_a0.cl create mode 100644 OpenCL/m13500_a1.cl create mode 100644 OpenCL/m13500_a3.cl diff --git a/OpenCL/m13500_a0.cl b/OpenCL/m13500_a0.cl new file mode 100644 index 000000000..6e8befd86 --- /dev/null +++ b/OpenCL/m13500_a0.cl @@ -0,0 +1,192 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + 
ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_utf16le_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const 
u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = 
swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha1_ctx_t ctx = ctx0; + + sha1_update_utf16le_swap (&ctx, w, pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13500_a1.cl b/OpenCL/m13500_a1.cl new file mode 100644 index 000000000..5cb45fda6 --- /dev/null +++ b/OpenCL/m13500_a1.cl @@ -0,0 +1,168 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S 
(esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + sha1_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + 
digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + sha1_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < 
il_cnt; il_pos++) + { + sha1_ctx_t ctx = ctx0; + + sha1_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha1_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13500_a3.cl b/OpenCL/m13500_a3.cl new file mode 100644 index 000000000..18828963c --- /dev/null +++ b/OpenCL/m13500_a3.cl @@ -0,0 +1,206 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha1.cl" + +__kernel void m13500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + 
/** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += 
VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_utf16beN (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m13500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const pstoken_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * salt + */ + + const u32 pc_offset = esalt_bufs[digests_offset].pc_offset; + + 
sha1_ctx_t ctx0; + + ctx0.h[0] = esalt_bufs[digests_offset].pc_digest[0]; + ctx0.h[1] = esalt_bufs[digests_offset].pc_digest[1]; + ctx0.h[2] = esalt_bufs[digests_offset].pc_digest[2]; + ctx0.h[3] = esalt_bufs[digests_offset].pc_digest[3]; + ctx0.h[4] = esalt_bufs[digests_offset].pc_digest[4]; + + ctx0.w0[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 0]); + ctx0.w0[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 1]); + ctx0.w0[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 2]); + ctx0.w0[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 3]); + ctx0.w1[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 4]); + ctx0.w1[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 5]); + ctx0.w1[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 6]); + ctx0.w1[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 7]); + ctx0.w2[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 8]); + ctx0.w2[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 9]); + ctx0.w2[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 10]); + ctx0.w2[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 11]); + ctx0.w3[0] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 12]); + ctx0.w3[1] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 13]); + ctx0.w3[2] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 14]); + ctx0.w3[3] = swap32_S (esalt_bufs[digests_offset].salt_buf[pc_offset + 15]); + + ctx0.len = esalt_bufs[digests_offset].salt_len; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 
= w0l | w0r; + + w[0] = w0; + + sha1_ctx_vector_t ctx; + + sha1_init_vector_from_scalar (&ctx, &ctx0); + + sha1_update_vector_utf16beN (&ctx, w, pw_len); + + sha1_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} From 4443ecd8615eff39c6424e360f0316f6c98f2416 Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 8 Aug 2017 15:26:34 +0200 Subject: [PATCH 75/75] Add pure kernels for Windows Phone 8+ PIN/password --- OpenCL/m13800_a0.cl | 134 +++++++++++++++++++++++++++++++++++ OpenCL/m13800_a1.cl | 114 ++++++++++++++++++++++++++++++ OpenCL/m13800_a3.cl | 166 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 414 insertions(+) create mode 100644 OpenCL/m13800_a0.cl create mode 100644 OpenCL/m13800_a1.cl create mode 100644 OpenCL/m13800_a3.cl diff --git a/OpenCL/m13800_a0.cl b/OpenCL/m13800_a0.cl new file mode 100644 index 000000000..f6dd8b14c --- /dev/null +++ b/OpenCL/m13800_a0.cl @@ -0,0 +1,134 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp.h" +#include "inc_rp.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha256.cl" + +__kernel void m13800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha256_ctx_t ctx; + + sha256_init (&ctx); + + sha256_update_utf16le_swap (&ctx, w, pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 
*d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32 w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + // todo: add rules engine + + sha256_ctx_t ctx; + + sha256_init (&ctx); + + sha256_update_utf16le_swap (&ctx, w, pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13800_a1.cl b/OpenCL/m13800_a1.cl new file mode 100644 index 000000000..6d618ad0b --- /dev/null +++ b/OpenCL/m13800_a1.cl @@ -0,0 +1,114 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +//#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_scalar.cl" +#include "inc_hash_sha256.cl" + +__kernel void m13800_mxx (__global pw_t *pws, __global const 
kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + sha256_ctx_t ctx0; + + sha256_init (&ctx0); + + sha256_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha256_ctx_t ctx = ctx0; + + sha256_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_M_SCALAR (r0, r1, r2, r3); + } +} + +__kernel void m13800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const 
u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + sha256_ctx_t ctx0; + + sha256_init (&ctx0); + + sha256_update_global_utf16le_swap (&ctx0, pws[gid].i, pws[gid].pw_len); + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + { + sha256_ctx_t ctx = ctx0; + + sha256_update_global_utf16le_swap (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len); + + sha256_update_global (&ctx, esalt_bufs[digests_offset].salt_buf, 128); + + sha256_final (&ctx); + + const u32 r0 = ctx.h[DGST_R0]; + const u32 r1 = ctx.h[DGST_R1]; + const u32 r2 = ctx.h[DGST_R2]; + const u32 r3 = ctx.h[DGST_R3]; + + COMPARE_S_SCALAR (r0, r1, r2, r3); + } +} diff --git a/OpenCL/m13800_a3.cl b/OpenCL/m13800_a3.cl new file mode 100644 index 000000000..fcf68e292 --- /dev/null +++ b/OpenCL/m13800_a3.cl @@ -0,0 +1,166 @@ +/** + * 
Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" +#include "inc_hash_sha256.cl" + +__kernel void m13800_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = 128; + + const u32 salt_lenv = 32; + + u32x s[32]; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = esalt_bufs[digests_offset].salt_buf[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + 
const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha256_ctx_vector_t ctx; + + sha256_init_vector (&ctx); + + sha256_update_vector_utf16beN (&ctx, w, pw_len); + + sha256_update_vector (&ctx, s, salt_len); + + sha256_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m13800_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const u32x *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const win8phone_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +{ + /** + * modifier + */ + + const u32 lid = get_local_id (0); + const u32 gid = get_global_id (0); + + if (gid >= gid_max) return; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * base + */ + + const u32 pw_len = pws[gid].pw_len; + + 
const u32 pw_lenv = ceil ((float) pw_len / 4); + + u32x w[64] = { 0 }; + + for (int idx = 0; idx < pw_lenv; idx++) + { + w[idx] = pws[gid].i[idx]; + } + + const u32 salt_len = 128; + + const u32 salt_lenv = 32; + + u32x s[32]; + + for (int idx = 0; idx < salt_lenv; idx++) + { + s[idx] = esalt_bufs[digests_offset].salt_buf[idx]; + } + + /** + * loop + */ + + u32x w0l = w[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + + const u32x w0 = w0l | w0r; + + w[0] = w0; + + sha256_ctx_vector_t ctx; + + sha256_init_vector (&ctx); + + sha256_update_vector_utf16beN (&ctx, w, pw_len); + + sha256_update_vector (&ctx, s, salt_len); + + sha256_final_vector (&ctx); + + const u32x r0 = ctx.h[DGST_R0]; + const u32x r1 = ctx.h[DGST_R1]; + const u32x r2 = ctx.h[DGST_R2]; + const u32x r3 = ctx.h[DGST_R3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +}