diff --git a/OpenCL/inc_hash_sm3.cl b/OpenCL/inc_hash_sm3.cl index b7950352d..f84cce6a8 100644 --- a/OpenCL/inc_hash_sm3.cl +++ b/OpenCL/inc_hash_sm3.cl @@ -98,195 +98,77 @@ DECLSPEC void sm3_transform (PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, u32 we_t = w3[2]; u32 wf_t = w3[3]; - // LOG_TMP_BUF_16("Buffer before rounds"); - - int i = 0; - // printf("Main loop execution :\n\n"); - // SM3 main loop, the Compression Function (CF) and Message Expansion (ME) are executed - // step-by-step. SM3_R1_S use SM3_FF0 and SM3_GG0 functions for index 0 to 15 and SM3_R2_S - // use SM3_FF1 and SM3_GG1 functions for index 16 to 63. - SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T00, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); // printf("w0_t : %.08x\n", w0_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T01, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); // printf("w1_t : %.08x\n", w1_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T02, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t);// printf("w2_t : %.08x\n", w2_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T03, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t);// printf("w3_t : %.08x\n", w3_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T04, w4_t, w4_t ^ w8_t); - w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t);// printf("w4_t : %.08x\n", w4_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T05, w5_t, w5_t ^ w9_t); - w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t);// printf("w5_t : %.08x\n", w5_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T06, w6_t, w6_t ^ wa_t); - w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t);// printf("w6_t : %.08x\n", w6_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T07, w7_t, w7_t ^ 
wb_t); - w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t);// printf("w7_t : %.08x\n", w7_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T08, w8_t, w8_t ^ wc_t); - w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t);// printf("w8_t : %.08x\n", w8_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T09, w9_t, w9_t ^ wd_t); - w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t);// printf("w9_t : %.08x\n", w9_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T10, wa_t, wa_t ^ we_t); - wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t);// printf("wa_t : %.08x\n", wa_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T11, wb_t, wb_t ^ wf_t); - wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t);// printf("wb_t : %.08x\n", wb_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(a, b, c, d, e, f, g, h, SM3_T12, wc_t, wc_t ^ w0_t); - wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t);// printf("wc_t : %.08x\n", wc_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(d, a, b, c, h, e, f, g, SM3_T13, wd_t, wd_t ^ w1_t); - wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t);// printf("wd_t : %.08x\n", wd_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(c, d, a, b, g, h, e, f, SM3_T14, we_t, we_t ^ w2_t); - we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t);// printf("we_t : %.08x\n", we_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R1_S(b, c, d, a, f, g, h, e, SM3_T15, wf_t, wf_t ^ w3_t); - wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t);// printf("wf_t : %.08x\n", wf_t); - //LOG_LOOP("Main loop", i); i++; - // Index = 16, switch to SM3_R2_S - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T16, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T17, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, 
SM3_T18, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T19, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T20, w4_t, w4_t ^ w8_t); - w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T21, w5_t, w5_t ^ w9_t); - w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T22, w6_t, w6_t ^ wa_t); - w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T23, w7_t, w7_t ^ wb_t); - w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T24, w8_t, w8_t ^ wc_t); - w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T25, w9_t, w9_t ^ wd_t); - w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T26, wa_t, wa_t ^ we_t); - wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T27, wb_t, wb_t ^ wf_t); - wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T28, wc_t, wc_t ^ w0_t); - wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T29, wd_t, wd_t ^ w1_t); - wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T30, we_t, we_t ^ w2_t); - we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T31, wf_t, wf_t ^ 
w3_t); - wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T32, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T33, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T34, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T35, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T36, w4_t, w4_t ^ w8_t); - w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T37, w5_t, w5_t ^ w9_t); - w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T38, w6_t, w6_t ^ wa_t); - w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T39, w7_t, w7_t ^ wb_t); - w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T40, w8_t, w8_t ^ wc_t); - w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T41, w9_t, w9_t ^ wd_t); - w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T42, wa_t, wa_t ^ we_t); - wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T43, wb_t, wb_t ^ wf_t); - wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T44, wc_t, wc_t ^ w0_t); - wc_t = 
SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T45, wd_t, wd_t ^ w1_t); - wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T46, we_t, we_t ^ w2_t); - we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T47, wf_t, wf_t ^ w3_t); - wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T48, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T49, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T50, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); - //LOG_LOOP("Main loop", i); i++; - // No more ME for index 52 to 63. 
- SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T53, w5_t, w5_t ^ w9_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t); - //LOG_LOOP("Main loop", i); i++; - SM3_R2_S(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t); - //LOG_LOOP("Main loop", i); i++; + // SM3 main loop, composed of 64 rounds (0 to 63). + // The Compression Function (CF) and Message Expansion (ME) are executed step-by-step. + // SM3_ROUND1_S use SM3_FF0 and SM3_GG0 functions for index 0 to 15 and SM3_ROUND2_S use SM3_FF1 and SM3_GG1 functions for index 16 to 63. 
+ // Rounds from 0 to 15 + SM3_ROUND1_S(a, b, c, d, e, f, g, h, SM3_T00, w0_t, w0_t ^ w4_t); + SM3_ROUND1_S(d, a, b, c, h, e, f, g, SM3_T01, w1_t, w1_t ^ w5_t); + SM3_ROUND1_S(c, d, a, b, g, h, e, f, SM3_T02, w2_t, w2_t ^ w6_t); + SM3_ROUND1_S(b, c, d, a, f, g, h, e, SM3_T03, w3_t, w3_t ^ w7_t); + SM3_ROUND1_S(a, b, c, d, e, f, g, h, SM3_T04, w4_t, w4_t ^ w8_t); + SM3_ROUND1_S(d, a, b, c, h, e, f, g, SM3_T05, w5_t, w5_t ^ w9_t); + SM3_ROUND1_S(c, d, a, b, g, h, e, f, SM3_T06, w6_t, w6_t ^ wa_t); + SM3_ROUND1_S(b, c, d, a, f, g, h, e, SM3_T07, w7_t, w7_t ^ wb_t); + SM3_ROUND1_S(a, b, c, d, e, f, g, h, SM3_T08, w8_t, w8_t ^ wc_t); + SM3_ROUND1_S(d, a, b, c, h, e, f, g, SM3_T09, w9_t, w9_t ^ wd_t); + SM3_ROUND1_S(c, d, a, b, g, h, e, f, SM3_T10, wa_t, wa_t ^ we_t); + SM3_ROUND1_S(b, c, d, a, f, g, h, e, SM3_T11, wb_t, wb_t ^ wf_t); + // Message Expansion start here because the algorithm need values computed by message expansion from the 12th round + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND1_S(a, b, c, d, e, f, g, h, SM3_T12, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND1_S(d, a, b, c, h, e, f, g, SM3_T13, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND1_S(c, d, a, b, g, h, e, f, SM3_T14, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND1_S(b, c, d, a, f, g, h, e, SM3_T15, wf_t, wf_t ^ w3_t); + + // Rounds from 16 to 63, switch to SM3_ROUND2_S + w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T16, w0_t, w0_t ^ w4_t); + w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T17, w1_t, w1_t ^ w5_t); + w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T18, w2_t, w2_t ^ w6_t); + w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T19, w3_t, w3_t ^ w7_t); + w8_t = SM3_EXPAND_S(w8_t, wf_t, 
w5_t, wb_t, w2_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T20, w4_t, w4_t ^ w8_t); + w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T21, w5_t, w5_t ^ w9_t); + wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T22, w6_t, w6_t ^ wa_t); + wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T23, w7_t, w7_t ^ wb_t); + wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T24, w8_t, w8_t ^ wc_t); + wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T25, w9_t, w9_t ^ wd_t); + we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T26, wa_t, wa_t ^ we_t); + wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T27, wb_t, wb_t ^ wf_t); + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T28, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T29, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T30, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T31, wf_t, wf_t ^ w3_t); + w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T32, w0_t, w0_t ^ w4_t); + w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T33, w1_t, w1_t ^ w5_t); + w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T34, w2_t, w2_t ^ w6_t); + w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T35, w3_t, w3_t ^ w7_t); + w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T36, w4_t, w4_t ^ w8_t); + w9_t = 
SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T37, w5_t, w5_t ^ w9_t); + wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T38, w6_t, w6_t ^ wa_t); + wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T39, w7_t, w7_t ^ wb_t); + wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T40, w8_t, w8_t ^ wc_t); + wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T41, w9_t, w9_t ^ wd_t); + we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T42, wa_t, wa_t ^ we_t); + wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T43, wb_t, wb_t ^ wf_t); + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T44, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T45, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T46, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T47, wf_t, wf_t ^ w3_t); + w4_t = SM3_EXPAND_S(w4_t, wb_t, w1_t, w7_t, we_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T48, w0_t, w0_t ^ w4_t); + w5_t = SM3_EXPAND_S(w5_t, wc_t, w2_t, w8_t, wf_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T49, w1_t, w1_t ^ w5_t); + w6_t = SM3_EXPAND_S(w6_t, wd_t, w3_t, w9_t, w0_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T50, w2_t, w2_t ^ w6_t); + w7_t = SM3_EXPAND_S(w7_t, we_t, w4_t, wa_t, w1_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t); + w8_t = SM3_EXPAND_S(w8_t, wf_t, w5_t, wb_t, w2_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t); + w9_t = SM3_EXPAND_S(w9_t, w0_t, w6_t, wc_t, w3_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T53, w5_t, 
w5_t ^ w9_t); + wa_t = SM3_EXPAND_S(wa_t, w1_t, w7_t, wd_t, w4_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t); + wb_t = SM3_EXPAND_S(wb_t, w2_t, w8_t, we_t, w5_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t); + wc_t = SM3_EXPAND_S(wc_t, w3_t, w9_t, wf_t, w6_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t); + wd_t = SM3_EXPAND_S(wd_t, w4_t, wa_t, w0_t, w7_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t); + we_t = SM3_EXPAND_S(we_t, w5_t, wb_t, w1_t, w8_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t); + wf_t = SM3_EXPAND_S(wf_t, w6_t, wc_t, w2_t, w9_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t); + w0_t = SM3_EXPAND_S(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND2_S(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND_S(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND2_S(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND_S(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND2_S(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND_S(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND2_S(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t); // LOG_BUF_16("\nBuffer after rounds of SM3"); @@ -1180,128 +1062,78 @@ DECLSPEC void sm3_transform_vector (PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x wd_t = w3[1]; u32x we_t = w3[2]; u32x wf_t = w3[3]; - - // SM3 main loop, the Compression Function (CF) and Message Expansion (ME) are executed - // step-by-step. SM3_R1 use SM3_FF0 and SM3_GG0 functions for index 0 to 15 and SM3_R2 - // use SM3_FF1 and SM3_GG1 functions for index 16 to 63. 
- SM3_R1(a, b, c, d, e, f, g, h, SM3_T00, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); - SM3_R1(d, a, b, c, h, e, f, g, SM3_T01, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); - SM3_R1(c, d, a, b, g, h, e, f, SM3_T02, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); - SM3_R1(b, c, d, a, f, g, h, e, SM3_T03, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); - SM3_R1(a, b, c, d, e, f, g, h, SM3_T04, w4_t, w4_t ^ w8_t); - w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t); - SM3_R1(d, a, b, c, h, e, f, g, SM3_T05, w5_t, w5_t ^ w9_t); - w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t); - SM3_R1(c, d, a, b, g, h, e, f, SM3_T06, w6_t, w6_t ^ wa_t); - w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t); - SM3_R1(b, c, d, a, f, g, h, e, SM3_T07, w7_t, w7_t ^ wb_t); - w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t); - SM3_R1(a, b, c, d, e, f, g, h, SM3_T08, w8_t, w8_t ^ wc_t); - w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t); - SM3_R1(d, a, b, c, h, e, f, g, SM3_T09, w9_t, w9_t ^ wd_t); - w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t); - SM3_R1(c, d, a, b, g, h, e, f, SM3_T10, wa_t, wa_t ^ we_t); - wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t); - SM3_R1(b, c, d, a, f, g, h, e, SM3_T11, wb_t, wb_t ^ wf_t); - wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t); - SM3_R1(a, b, c, d, e, f, g, h, SM3_T12, wc_t, wc_t ^ w0_t); - wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t); - SM3_R1(d, a, b, c, h, e, f, g, SM3_T13, wd_t, wd_t ^ w1_t); - wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t); - SM3_R1(c, d, a, b, g, h, e, f, SM3_T14, we_t, we_t ^ w2_t); - we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t); - SM3_R1(b, c, d, a, f, g, h, e, SM3_T15, wf_t, wf_t ^ w3_t); - wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t); - // Index = 16, switch to SM3_R2 - SM3_R2(a, b, c, d, e, f, g, h, SM3_T16, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T17, 
w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T18, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T19, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T20, w4_t, w4_t ^ w8_t); - w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T21, w5_t, w5_t ^ w9_t); - w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T22, w6_t, w6_t ^ wa_t); - w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T23, w7_t, w7_t ^ wb_t); - w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T24, w8_t, w8_t ^ wc_t); - w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T25, w9_t, w9_t ^ wd_t); - w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T26, wa_t, wa_t ^ we_t); - wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T27, wb_t, wb_t ^ wf_t); - wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T28, wc_t, wc_t ^ w0_t); - wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T29, wd_t, wd_t ^ w1_t); - wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T30, we_t, we_t ^ w2_t); - we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T31, wf_t, wf_t ^ w3_t); - wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T32, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T33, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T34, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); - 
SM3_R2(b, c, d, a, f, g, h, e, SM3_T35, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T36, w4_t, w4_t ^ w8_t); - w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T37, w5_t, w5_t ^ w9_t); - w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T38, w6_t, w6_t ^ wa_t); - w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T39, w7_t, w7_t ^ wb_t); - w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T40, w8_t, w8_t ^ wc_t); - w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T41, w9_t, w9_t ^ wd_t); - w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T42, wa_t, wa_t ^ we_t); - wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T43, wb_t, wb_t ^ wf_t); - wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T44, wc_t, wc_t ^ w0_t); - wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T45, wd_t, wd_t ^ w1_t); - wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T46, we_t, we_t ^ w2_t); - we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T47, wf_t, wf_t ^ w3_t); - wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T48, w0_t, w0_t ^ w4_t); - w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T49, w1_t, w1_t ^ w5_t); - w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T50, w2_t, w2_t ^ w6_t); - w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t); - w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); - // No more ME for index 52 to 63. 
- SM3_R2(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T53, w5_t, w5_t ^ w9_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t); - SM3_R2(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t); - SM3_R2(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t); - SM3_R2(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t); - SM3_R2(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t); + + // SM3 main loop, composed of 64 rounds (0 to 63). + // The Compression Function (CF) and Message Expansion (ME) are executed step-by-step. + // SM3_ROUND1 use SM3_FF0 and SM3_GG0 functions for index 0 to 15 and SM3_ROUND2 use SM3_FF1 and SM3_GG1 functions for index 16 to 63. 
+ // Rounds from 0 to 15 + SM3_ROUND1(a, b, c, d, e, f, g, h, SM3_T0, w0_t, w0_t ^ w4_t); + SM3_ROUND1(d, a, b, c, h, e, f, g, SM3_T1, w1_t, w1_t ^ w5_t); + SM3_ROUND1(c, d, a, b, g, h, e, f, SM3_T2, w2_t, w2_t ^ w6_t); + SM3_ROUND1(b, c, d, a, f, g, h, e, SM3_T3, w3_t, w3_t ^ w7_t); + SM3_ROUND1(a, b, c, d, e, f, g, h, SM3_T4, w4_t, w4_t ^ w8_t); + SM3_ROUND1(d, a, b, c, h, e, f, g, SM3_T5, w5_t, w5_t ^ w9_t); + SM3_ROUND1(c, d, a, b, g, h, e, f, SM3_T6, w6_t, w6_t ^ wa_t); + SM3_ROUND1(b, c, d, a, f, g, h, e, SM3_T7, w7_t, w7_t ^ wb_t); + SM3_ROUND1(a, b, c, d, e, f, g, h, SM3_T8, w8_t, w8_t ^ wc_t); + SM3_ROUND1(d, a, b, c, h, e, f, g, SM3_T9, w9_t, w9_t ^ wd_t); + SM3_ROUND1(c, d, a, b, g, h, e, f, SM3_T10, wa_t, wa_t ^ we_t); + SM3_ROUND1(b, c, d, a, f, g, h, e, SM3_T11, wb_t, wb_t ^ wf_t); + // Message Expansion start here because the algorithm need values computed by message expansion from the 12th round + w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND1(a, b, c, d, e, f, g, h, SM3_T12, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND1(d, a, b, c, h, e, f, g, SM3_T13, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND1(c, d, a, b, g, h, e, f, SM3_T14, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND1(b, c, d, a, f, g, h, e, SM3_T15, wf_t, wf_t ^ w3_t); + + // Rounds from 16 to 63, switch to SM3_ROUND2 + w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T16, w0_t, w0_t ^ w4_t); + w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T17, w1_t, w1_t ^ w5_t); + w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T18, w2_t, w2_t ^ w6_t); + w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T19, w3_t, w3_t ^ w7_t); + w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T20, w4_t, 
w4_t ^ w8_t); + w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T21, w5_t, w5_t ^ w9_t); + wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T22, w6_t, w6_t ^ wa_t); + wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T23, w7_t, w7_t ^ wb_t); + wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T24, w8_t, w8_t ^ wc_t); + wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T25, w9_t, w9_t ^ wd_t); + we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T26, wa_t, wa_t ^ we_t); + wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T27, wb_t, wb_t ^ wf_t); + w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T28, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T29, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T30, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T31, wf_t, wf_t ^ w3_t); + w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T32, w0_t, w0_t ^ w4_t); + w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T33, w1_t, w1_t ^ w5_t); + w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T34, w2_t, w2_t ^ w6_t); + w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T35, w3_t, w3_t ^ w7_t); + w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T36, w4_t, w4_t ^ w8_t); + w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T37, w5_t, w5_t ^ w9_t); + wa_t = SM3_EXPAND(wa_t, 
w1_t, w7_t, wd_t, w4_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T38, w6_t, w6_t ^ wa_t); + wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T39, w7_t, w7_t ^ wb_t); + wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T40, w8_t, w8_t ^ wc_t); + wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T41, w9_t, w9_t ^ wd_t); + we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T42, wa_t, wa_t ^ we_t); + wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T43, wb_t, wb_t ^ wf_t); + w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T44, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T45, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T46, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T47, wf_t, wf_t ^ w3_t); + w4_t = SM3_EXPAND(w4_t, wb_t, w1_t, w7_t, we_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T48, w0_t, w0_t ^ w4_t); + w5_t = SM3_EXPAND(w5_t, wc_t, w2_t, w8_t, wf_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T49, w1_t, w1_t ^ w5_t); + w6_t = SM3_EXPAND(w6_t, wd_t, w3_t, w9_t, w0_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T50, w2_t, w2_t ^ w6_t); + w7_t = SM3_EXPAND(w7_t, we_t, w4_t, wa_t, w1_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T51, w3_t, w3_t ^ w7_t); + w8_t = SM3_EXPAND(w8_t, wf_t, w5_t, wb_t, w2_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T52, w4_t, w4_t ^ w8_t); + w9_t = SM3_EXPAND(w9_t, w0_t, w6_t, wc_t, w3_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T53, w5_t, w5_t ^ w9_t); + wa_t = SM3_EXPAND(wa_t, w1_t, w7_t, wd_t, w4_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T54, w6_t, w6_t ^ wa_t); + wb_t = SM3_EXPAND(wb_t, w2_t, w8_t, we_t, w5_t); SM3_ROUND2(b, c, d, 
a, f, g, h, e, SM3_T55, w7_t, w7_t ^ wb_t); + wc_t = SM3_EXPAND(wc_t, w3_t, w9_t, wf_t, w6_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T56, w8_t, w8_t ^ wc_t); + wd_t = SM3_EXPAND(wd_t, w4_t, wa_t, w0_t, w7_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T57, w9_t, w9_t ^ wd_t); + we_t = SM3_EXPAND(we_t, w5_t, wb_t, w1_t, w8_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T58, wa_t, wa_t ^ we_t); + wf_t = SM3_EXPAND(wf_t, w6_t, wc_t, w2_t, w9_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T59, wb_t, wb_t ^ wf_t); + w0_t = SM3_EXPAND(w0_t, w7_t, wd_t, w3_t, wa_t); SM3_ROUND2(a, b, c, d, e, f, g, h, SM3_T60, wc_t, wc_t ^ w0_t); + w1_t = SM3_EXPAND(w1_t, w8_t, we_t, w4_t, wb_t); SM3_ROUND2(d, a, b, c, h, e, f, g, SM3_T61, wd_t, wd_t ^ w1_t); + w2_t = SM3_EXPAND(w2_t, w9_t, wf_t, w5_t, wc_t); SM3_ROUND2(c, d, a, b, g, h, e, f, SM3_T62, we_t, we_t ^ w2_t); + w3_t = SM3_EXPAND(w3_t, wa_t, w0_t, w6_t, wd_t); SM3_ROUND2(b, c, d, a, f, g, h, e, SM3_T63, wf_t, wf_t ^ w3_t); digest[0] ^= a; digest[1] ^= b; diff --git a/OpenCL/inc_hash_sm3.h b/OpenCL/inc_hash_sm3.h index 466818a78..b78af067b 100644 --- a/OpenCL/inc_hash_sm3.h +++ b/OpenCL/inc_hash_sm3.h @@ -18,60 +18,43 @@ #define SM3_FF1(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) #define SM3_GG1(x, y, z) (((z) ^ ((x) & ((y) ^ (z))))) -#define SM3_EXPAND_S(a, b, c, d, e) \ - (SM3_P1_S(a ^ b ^ hc_rotl32_S(c, 15)) ^ hc_rotl32_S(d, 7) ^ e) -#define SM3_EXPAND(a, b, c, d, e) \ - (SM3_P1(a ^ b ^ (c, 15)) ^ hc_rotl32(d, 7) ^ e) +// Message expansion step: P1(a ^ b ^ rotl(c, 15)) ^ rotl(d, 7) ^ e; the _S macro is the scalar (u32) form, the other the vector (u32x) form +#define SM3_EXPAND_S(a, b, c, d, e) (SM3_P1_S(a ^ b ^ hc_rotl32_S(c, 15)) ^ hc_rotl32_S(d, 7) ^ e) +#define SM3_EXPAND(a, b, c, d, e) (SM3_P1(a ^ b ^ hc_rotl32(c, 15)) ^ hc_rotl32(d, 7) ^ e) // Only Wj need to be parenthesis because of operator priority // (Wj = Wi ^ Wi+4) -#define SM3_R1_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ -{ \ - const u32 A_ROTL12 = hc_rotl32_S(a, 12); \ - const u32 SS1 = hc_rotl32_S(A_ROTL12 + e + Tj, 7); \ - const u32 TT1 = SM3_FF0(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ - const u32 TT2 = SM3_GG0(e, f, 
g) + h + SS1 + Wi; \ - b = hc_rotl32_S(b, 9); \ - d = TT1; \ - f = hc_rotl32_S(f, 19); \ - h = SM3_P0_S(TT2); \ +// One compression round updating b, d, f and h in place; FF and GG name the boolean functions applied to (a, b, c) and (e, f, g) +#define SM3_ROUND_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj, FF, GG) \ +{ \ + const u32 A_ROTL12 = hc_rotl32_S(a, 12); \ + const u32 SS1 = hc_rotl32_S(A_ROTL12 + e + Tj, 7); \ + const u32 TT1 = FF(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ + const u32 TT2 = GG(e, f, g) + h + SS1 + Wi; \ + b = hc_rotl32_S(b, 9); \ + d = TT1; \ + f = hc_rotl32_S(f, 19); \ + h = SM3_P0_S(TT2); \ } -#define SM3_R1(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ -{ \ - const u32 A_ROTL12 = hc_rotl32(a, 12); \ - const u32 SS1 = hc_rotl32(A_ROTL12 + e + Tj, 7); \ - const u32 TT1 = SM3_FF0(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ - const u32 TT2 = SM3_GG0(e, f, g) + h + SS1 + Wi; \ - b = hc_rotl32(b, 9); \ - d = TT1; \ - f = hc_rotl32(f, 19); \ - h = SM3_P0(TT2); \ +// Vector form of SM3_ROUND_S: temporaries are u32x to match the u32x operands, and Tj is widened with make_u32x +#define SM3_ROUND(a, b, c, d, e, f, g, h, Tj, Wi, Wj, FF, GG) \ +{ \ + const u32x A_ROTL12 = hc_rotl32(a, 12); \ + const u32x SS1 = hc_rotl32(A_ROTL12 + e + make_u32x(Tj), 7); \ + const u32x TT1 = FF(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ + const u32x TT2 = GG(e, f, g) + h + SS1 + Wi; \ + b = hc_rotl32(b, 9); \ + d = TT1; \ + f = hc_rotl32(f, 19); \ + h = SM3_P0(TT2); \ } -#define SM3_R2_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ -{ \ - const u32 A_ROTL12 = hc_rotl32_S(a, 12); \ - const u32 SS1 = hc_rotl32_S(A_ROTL12 + e + Tj, 7); \ - const u32 TT1 = SM3_FF1(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ - const u32 TT2 = SM3_GG1(e, f, g) + h + SS1 + Wi; \ - b = hc_rotl32_S(b, 9); \ - d = TT1; \ - f = hc_rotl32_S(f, 19); \ - h = SM3_P0_S(TT2); \ -} - -#define SM3_R2(a, b, c, d, e, f, g, h, Tj, Wi, Wj) \ -{ \ - const u32 A_ROTL12 = hc_rotl32(a, 12); \ - const u32 SS1 = hc_rotl32(A_ROTL12 + e + Tj, 7); \ - const u32 TT1 = SM3_FF1(a, b, c) + d + (SS1 ^ A_ROTL12) + (Wj); \ - const u32 TT2 = SM3_GG1(e, f, g) + h + SS1 + Wi; \ - b = hc_rotl32(b, 9); \ - d = TT1; \ - f = hc_rotl32(f, 19); \ - h = SM3_P0(TT2); \ -} +// ROUND1 variants pair SM3_FF0 with SM3_GG0, ROUND2 variants pair SM3_FF1 with SM3_GG1 +#define SM3_ROUND1_S(a, b, c, d, 
e, f, g, h, Tj, Wi, Wj) SM3_ROUND_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj, SM3_FF0, SM3_GG0) +#define SM3_ROUND1(a, b, c, d, e, f, g, h, Tj, Wi, Wj) SM3_ROUND(a, b, c, d, e, f, g, h, Tj, Wi, Wj, SM3_FF0, SM3_GG0) +#define SM3_ROUND2_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj) SM3_ROUND_S(a, b, c, d, e, f, g, h, Tj, Wi, Wj, SM3_FF1, SM3_GG1) +#define SM3_ROUND2(a, b, c, d, e, f, g, h, Tj, Wi, Wj) SM3_ROUND(a, b, c, d, e, f, g, h, Tj, Wi, Wj, SM3_FF1, SM3_GG1) typedef struct sm3_ctx {