Unroll some of the code in the candidate generators

pull/2387/head
Jens Steube 4 years ago
parent e077b5ded4
commit b6feddd81f

@@ -34,11 +34,70 @@ KERNEL_FQ void amp (GLOBAL_AS pw_t *pws, GLOBAL_AS pw_t *pws_amp, GLOBAL_AS cons
switch_buffer_by_offset_1x64_le_S (pw.i, comb_len);
}
#pragma unroll
for (int i = 0; i < 64; i++)
{
pw.i[i] |= comb.i[i];
}
pw.i[ 0] |= comb.i[ 0];
pw.i[ 1] |= comb.i[ 1];
pw.i[ 2] |= comb.i[ 2];
pw.i[ 3] |= comb.i[ 3];
pw.i[ 4] |= comb.i[ 4];
pw.i[ 5] |= comb.i[ 5];
pw.i[ 6] |= comb.i[ 6];
pw.i[ 7] |= comb.i[ 7];
pw.i[ 8] |= comb.i[ 8];
pw.i[ 9] |= comb.i[ 9];
pw.i[10] |= comb.i[10];
pw.i[11] |= comb.i[11];
pw.i[12] |= comb.i[12];
pw.i[13] |= comb.i[13];
pw.i[14] |= comb.i[14];
pw.i[15] |= comb.i[15];
pw.i[16] |= comb.i[16];
pw.i[17] |= comb.i[17];
pw.i[18] |= comb.i[18];
pw.i[19] |= comb.i[19];
pw.i[20] |= comb.i[20];
pw.i[21] |= comb.i[21];
pw.i[22] |= comb.i[22];
pw.i[23] |= comb.i[23];
pw.i[24] |= comb.i[24];
pw.i[25] |= comb.i[25];
pw.i[26] |= comb.i[26];
pw.i[27] |= comb.i[27];
pw.i[28] |= comb.i[28];
pw.i[29] |= comb.i[29];
pw.i[30] |= comb.i[30];
pw.i[31] |= comb.i[31];
pw.i[32] |= comb.i[32];
pw.i[33] |= comb.i[33];
pw.i[34] |= comb.i[34];
pw.i[35] |= comb.i[35];
pw.i[36] |= comb.i[36];
pw.i[37] |= comb.i[37];
pw.i[38] |= comb.i[38];
pw.i[39] |= comb.i[39];
pw.i[40] |= comb.i[40];
pw.i[41] |= comb.i[41];
pw.i[42] |= comb.i[42];
pw.i[43] |= comb.i[43];
pw.i[44] |= comb.i[44];
pw.i[45] |= comb.i[45];
pw.i[46] |= comb.i[46];
pw.i[47] |= comb.i[47];
pw.i[48] |= comb.i[48];
pw.i[49] |= comb.i[49];
pw.i[50] |= comb.i[50];
pw.i[51] |= comb.i[51];
pw.i[52] |= comb.i[52];
pw.i[53] |= comb.i[53];
pw.i[54] |= comb.i[54];
pw.i[55] |= comb.i[55];
pw.i[56] |= comb.i[56];
pw.i[57] |= comb.i[57];
pw.i[58] |= comb.i[58];
pw.i[59] |= comb.i[59];
pw.i[60] |= comb.i[60];
pw.i[61] |= comb.i[61];
pw.i[62] |= comb.i[62];
pw.i[63] |= comb.i[63];
pw.pw_len = pw_len + comb_len;

@@ -52,17 +52,79 @@ KERNEL_FQ void l_markov (GLOBAL_AS pw_t *pws_buf_l, GLOBAL_AS const cs_t *root_c
if (gid >= gid_max) return;
u32 pw_buf[64] = { 0 };
generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid);
pw_t pw;
#pragma unroll
for (int idx = 0; idx < 64; idx++)
{
pws_buf_l[gid].i[idx] = pw_buf[idx];
}
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
pw.pw_len = pw_l_len + pw_r_len;
generate_pw (pw.i, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid);
pws_buf_l[gid].pw_len = pw_l_len + pw_r_len;
pws_buf_l[gid] = pw;
}
KERNEL_FQ void r_markov (GLOBAL_AS bf_t *pws_buf_r, GLOBAL_AS const cs_t *root_css_buf, GLOBAL_AS const cs_t *markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max)
@@ -71,11 +133,76 @@ KERNEL_FQ void r_markov (GLOBAL_AS bf_t *pws_buf_r, GLOBAL_AS const cs_t *root_c
if (gid >= gid_max) return;
u32 pw_buf[64] = { 0 };
pw_t pw;
generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid);
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
pws_buf_r[gid].i = pw_buf[0];
generate_pw (pw.i, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid);
pws_buf_r[gid].i = pw.i[0];
}
KERNEL_FQ void C_markov (GLOBAL_AS pw_t *pws_buf, GLOBAL_AS const cs_t *root_css_buf, GLOBAL_AS const cs_t *markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max)
@@ -84,15 +211,76 @@ KERNEL_FQ void C_markov (GLOBAL_AS pw_t *pws_buf, GLOBAL_AS const cs_t *root_css
if (gid >= gid_max) return;
u32 pw_buf[64] = { 0 };
pw_t pw;
generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid);
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
#pragma unroll
for (int idx = 0; idx < 64; idx++)
{
pws_buf[gid].i[idx] = pw_buf[idx];
}
pw.pw_len = pw_len;
generate_pw (pw.i, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid);
pws_buf[gid].pw_len = pw_len;
pws_buf[gid] = pw;
}

@@ -52,17 +52,78 @@ KERNEL_FQ void l_markov (GLOBAL_AS pw_t *pws_buf_l, GLOBAL_AS const cs_t *root_c
if (gid >= gid_max) return;
u32 pw_buf[64] = { 0 };
pw_t pw;
generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid);
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
#pragma unroll
for (int idx = 0; idx < 64; idx++)
{
pws_buf_l[gid].i[idx] = pw_buf[idx];
}
pw.pw_len = pw_l_len + pw_r_len;
pws_buf_l[gid].pw_len = pw_l_len + pw_r_len;
generate_pw (pw.i, root_css_buf, markov_css_buf, pw_l_len, pw_r_len, mask80, bits14, bits15, off + gid);
pws_buf_l[gid] = pw;
}
KERNEL_FQ void r_markov (GLOBAL_AS bf_t *pws_buf_r, GLOBAL_AS const cs_t *root_css_buf, GLOBAL_AS const cs_t *markov_css_buf, const u64 off, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max)
@@ -71,11 +132,76 @@ KERNEL_FQ void r_markov (GLOBAL_AS bf_t *pws_buf_r, GLOBAL_AS const cs_t *root_c
if (gid >= gid_max) return;
u32 pw_buf[64] = { 0 };
pw_t pw;
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid);
generate_pw (pw.i, root_css_buf, markov_css_buf, pw_r_len, 0, 0, 0, 0, off + gid);
pws_buf_r[gid].i = pw_buf[0];
pws_buf_r[gid].i = pw.i[0];
}
KERNEL_FQ void C_markov (GLOBAL_AS pw_t *pws_buf, GLOBAL_AS const cs_t *root_css_buf, GLOBAL_AS const cs_t *markov_css_buf, const u64 off, const u32 pw_len, const u32 mask80, const u32 bits14, const u32 bits15, const u64 gid_max)
@@ -84,15 +210,76 @@ KERNEL_FQ void C_markov (GLOBAL_AS pw_t *pws_buf, GLOBAL_AS const cs_t *root_css
if (gid >= gid_max) return;
u32 pw_buf[64] = { 0 };
pw_t pw;
generate_pw (pw_buf, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid);
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
#pragma unroll
for (int idx = 0; idx < 64; idx++)
{
pws_buf[gid].i[idx] = pw_buf[idx];
}
pw.pw_len = pw_len;
generate_pw (pw.i, root_css_buf, markov_css_buf, pw_len, 0, mask80, bits14, bits15, off + gid);
pws_buf[gid].pw_len = pw_len;
pws_buf[gid] = pw;
}

@@ -10,26 +10,87 @@
#include "inc_common.cl"
#endif
DECLSPEC void gpu_decompress_entry (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, pw_t *pw, const u64 gid)
DECLSPEC void gpu_decompress_entry (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, pw_t *buf, const u64 gid)
{
const u32 off = pws_idx[gid].off;
const u32 cnt = pws_idx[gid].cnt;
const u32 len = pws_idx[gid].len;
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < 64; i++)
{
pw->i[i] = 0;
}
pw_t pw;
pw.i[ 0] = 0;
pw.i[ 1] = 0;
pw.i[ 2] = 0;
pw.i[ 3] = 0;
pw.i[ 4] = 0;
pw.i[ 5] = 0;
pw.i[ 6] = 0;
pw.i[ 7] = 0;
pw.i[ 8] = 0;
pw.i[ 9] = 0;
pw.i[10] = 0;
pw.i[11] = 0;
pw.i[12] = 0;
pw.i[13] = 0;
pw.i[14] = 0;
pw.i[15] = 0;
pw.i[16] = 0;
pw.i[17] = 0;
pw.i[18] = 0;
pw.i[19] = 0;
pw.i[20] = 0;
pw.i[21] = 0;
pw.i[22] = 0;
pw.i[23] = 0;
pw.i[24] = 0;
pw.i[25] = 0;
pw.i[26] = 0;
pw.i[27] = 0;
pw.i[28] = 0;
pw.i[29] = 0;
pw.i[30] = 0;
pw.i[31] = 0;
pw.i[32] = 0;
pw.i[33] = 0;
pw.i[34] = 0;
pw.i[35] = 0;
pw.i[36] = 0;
pw.i[37] = 0;
pw.i[38] = 0;
pw.i[39] = 0;
pw.i[40] = 0;
pw.i[41] = 0;
pw.i[42] = 0;
pw.i[43] = 0;
pw.i[44] = 0;
pw.i[45] = 0;
pw.i[46] = 0;
pw.i[47] = 0;
pw.i[48] = 0;
pw.i[49] = 0;
pw.i[50] = 0;
pw.i[51] = 0;
pw.i[52] = 0;
pw.i[53] = 0;
pw.i[54] = 0;
pw.i[55] = 0;
pw.i[56] = 0;
pw.i[57] = 0;
pw.i[58] = 0;
pw.i[59] = 0;
pw.i[60] = 0;
pw.i[61] = 0;
pw.i[62] = 0;
pw.i[63] = 0;
pw.pw_len = len;
for (u32 i = 0, j = off; i < cnt; i++, j++)
{
pw->i[i] = pws_comp[j];
pw.i[i] = pws_comp[j];
}
pw->pw_len = len;
*buf = pw;
}
KERNEL_FQ void gpu_decompress (GLOBAL_AS pw_idx_t *pws_idx, GLOBAL_AS u32 *pws_comp, GLOBAL_AS pw_t *pws_buf, const u64 gid_max)

Loading…
Cancel
Save